From 8b4000f13b303cc154136abc74c55670673e2a96 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 8 May 2017 17:13:54 +0000 Subject: Vendor import of lldb trunk r302418: https://llvm.org/svn/llvm-project/lldb/trunk@302418 --- include/lldb/API/SBAddress.h | 4 + include/lldb/API/SBInstruction.h | 2 + include/lldb/API/SBInstructionList.h | 9 + include/lldb/Core/Disassembler.h | 2 + include/lldb/Expression/Expression.h | 10 + include/lldb/Host/MainLoop.h | 7 + include/lldb/Host/common/UDPSocket.h | 4 +- include/lldb/Target/ThreadPlanCallFunction.h | 2 +- include/lldb/Target/ThreadPlanCallUserExpression.h | 3 + include/lldb/Utility/TaskPool.h | 108 +---------- .../multiline/TestMultilineExpressions.py | 28 +++ .../step_over_breakpoint/TestStepOverBreakpoint.py | 9 +- .../return-value/TestReturnValue.py | 50 +++-- .../tools/lldb-server/TestGdbRemoteHostInfo.py | 1 + scripts/interface/SBInstruction.i | 3 + scripts/interface/SBInstructionList.i | 3 + source/API/SBAddress.cpp | 6 + source/API/SBInstruction.cpp | 7 + source/API/SBInstructionList.cpp | 26 +++ source/API/SBProcess.cpp | 16 +- source/Core/Disassembler.cpp | 4 + source/Host/common/Editline.cpp | 2 +- source/Host/common/MainLoop.cpp | 206 ++++++++++----------- source/Host/common/UDPSocket.cpp | 82 ++++---- source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp | 34 ++-- .../MacOSX-DYLD/DynamicLoaderMacOS.cpp | 17 +- .../Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp | 73 +++----- source/Target/ThreadPlanCallUserExpression.cpp | 11 ++ source/Utility/TaskPool.cpp | 23 +++ unittests/Host/CMakeLists.txt | 1 + unittests/Host/MainLoopTest.cpp | 120 ++++++++++++ unittests/Utility/TaskPoolTest.cpp | 31 +--- www/lldb-gdb.html | 31 ++++ 33 files changed, 566 insertions(+), 369 deletions(-) create mode 100644 unittests/Host/MainLoopTest.cpp diff --git a/include/lldb/API/SBAddress.h b/include/lldb/API/SBAddress.h index ddbe5a742786..9e697beffdd1 100644 --- a/include/lldb/API/SBAddress.h +++ b/include/lldb/API/SBAddress.h @@ -103,6 +103,8 @@ protected: const lldb_private::Address *operator->() const; + friend bool operator==(const SBAddress &lhs, const SBAddress &rhs); + lldb_private::Address *get(); lldb_private::Address &ref(); @@ -117,6 +119,8 @@ private: std::unique_ptr m_opaque_ap; }; +bool operator==(const SBAddress &lhs, const SBAddress &rhs); + } // namespace lldb #endif // LLDB_SBAddress_h_ diff --git a/include/lldb/API/SBInstruction.h b/include/lldb/API/SBInstruction.h index 0fc12eb61cba..23daf1c56637 100644 --- a/include/lldb/API/SBInstruction.h +++ b/include/lldb/API/SBInstruction.h @@ -53,6 +53,8 @@ public: bool HasDelaySlot(); + bool CanSetBreakpoint(); + void Print(FILE *out); bool GetDescription(lldb::SBStream &description); diff --git a/include/lldb/API/SBInstructionList.h b/include/lldb/API/SBInstructionList.h index 29baef5790eb..0323a3c80c05 100644 --- a/include/lldb/API/SBInstructionList.h +++ b/include/lldb/API/SBInstructionList.h @@ -32,6 +32,15 @@ public: lldb::SBInstruction GetInstructionAtIndex(uint32_t idx); + // ---------------------------------------------------------------------- + // Returns the number of instructions between the start and end address. + // If canSetBreakpoint is true then the count will be the number of + // instructions on which a breakpoint can be set. 
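+ // Both start and end must be addresses of instructions contained in
+ // this list; otherwise the returned count is not meaningful.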
+ // ---------------------------------------------------------------------- + size_t GetInstructionsCount(const SBAddress &start, + const SBAddress &end, + bool canSetBreakpoint = false); + void Clear(); void AppendInstruction(lldb::SBInstruction inst); diff --git a/include/lldb/Core/Disassembler.h b/include/lldb/Core/Disassembler.h index 929b668c092b..addc83ad5e9d 100644 --- a/include/lldb/Core/Disassembler.h +++ b/include/lldb/Core/Disassembler.h @@ -173,6 +173,8 @@ public: virtual bool HasDelaySlot(); + bool CanSetBreakpoint (); + virtual size_t Decode(const Disassembler &disassembler, const DataExtractor &data, lldb::offset_t data_offset) = 0; diff --git a/include/lldb/Expression/Expression.h b/include/lldb/Expression/Expression.h index f48a7992227d..860444e9c2c2 100644 --- a/include/lldb/Expression/Expression.h +++ b/include/lldb/Expression/Expression.h @@ -99,6 +99,16 @@ public: //------------------------------------------------------------------ lldb::addr_t StartAddress() { return m_jit_start_addr; } + //------------------------------------------------------------------ + /// Called to notify the expression that it is about to be executed. + //------------------------------------------------------------------ + virtual void WillStartExecuting() {} + + //------------------------------------------------------------------ + /// Called to notify the expression that its execution has finished. + //------------------------------------------------------------------ + virtual void DidFinishExecuting() {} + virtual ExpressionTypeSystemHelper *GetTypeSystemHelper() { return nullptr; } protected: diff --git a/include/lldb/Host/MainLoop.h b/include/lldb/Host/MainLoop.h index 79370bf8461f..f5d906e98a7b 100644 --- a/include/lldb/Host/MainLoop.h +++ b/include/lldb/Host/MainLoop.h @@ -42,6 +42,7 @@ private: public: typedef std::unique_ptr SignalHandleUP; + MainLoop(); ~MainLoop() override; ReadHandleUP RegisterReadObject(const lldb::IOObjectSP &object_sp, @@ -71,6 +72,9 @@ protected: void UnregisterSignal(int signo); private: + void ProcessReadObject(IOObject::WaitableHandle handle); + void ProcessSignal(int signo); + class SignalHandle { public: ~SignalHandle() { m_mainloop.UnregisterSignal(m_signo); } @@ -97,6 +101,9 @@ private: llvm::DenseMap m_read_fds; llvm::DenseMap m_signals; +#if HAVE_SYS_EVENT_H + int m_kqueue; +#endif bool m_terminate_request : 1; }; diff --git a/include/lldb/Host/common/UDPSocket.h b/include/lldb/Host/common/UDPSocket.h index 38524fa8f62b..977ce151e4ff 100644 --- a/include/lldb/Host/common/UDPSocket.h +++ b/include/lldb/Host/common/UDPSocket.h @@ -21,15 +21,13 @@ public: Socket *&socket); private: - UDPSocket(NativeSocket socket, const UDPSocket &listen_socket); + UDPSocket(NativeSocket socket); size_t Send(const void *buf, const size_t num_bytes) override; Error Connect(llvm::StringRef name) override; Error Listen(llvm::StringRef name, int backlog) override; Error Accept(Socket *&socket) override; - Error CreateSocket(); - SocketAddress m_sockaddr; }; } diff --git a/include/lldb/Target/ThreadPlanCallFunction.h b/include/lldb/Target/ThreadPlanCallFunction.h index 3d43491af9af..1c75b0a3645c 100644 --- a/include/lldb/Target/ThreadPlanCallFunction.h +++ b/include/lldb/Target/ThreadPlanCallFunction.h @@ -117,7 +117,7 @@ protected: lldb::addr_t &start_load_addr, lldb::addr_t &function_load_addr); - void DoTakedown(bool success); + virtual void DoTakedown(bool success); void SetBreakpoints(); diff --git a/include/lldb/Target/ThreadPlanCallUserExpression.h 
b/include/lldb/Target/ThreadPlanCallUserExpression.h index f1425b2f97e1..5fe80927ca21 100644 --- a/include/lldb/Target/ThreadPlanCallUserExpression.h +++ b/include/lldb/Target/ThreadPlanCallUserExpression.h @@ -35,6 +35,8 @@ public: void GetDescription(Stream *s, lldb::DescriptionLevel level) override; + void DidPush() override; + void WillPop() override; lldb::StopInfoSP GetRealStopInfo() override; @@ -48,6 +50,7 @@ public: } protected: + void DoTakedown(bool success) override; private: lldb::UserExpressionSP m_user_expression_sp; // This is currently just used to ensure the diff --git a/include/lldb/Utility/TaskPool.h b/include/lldb/Utility/TaskPool.h index fb936bbb739a..87b8824f9226 100644 --- a/include/lldb/Utility/TaskPool.h +++ b/include/lldb/Utility/TaskPool.h @@ -53,50 +53,6 @@ private: static void AddTaskImpl(std::function &&task_fn); }; -// Wrapper class around the global TaskPool implementation to make it possible -// to create a set of -// tasks and then wait for the tasks to be completed by the -// WaitForNextCompletedTask call. This -// class should be used when WaitForNextCompletedTask is needed because this -// class add no other -// extra functionality to the TaskPool class and it have a very minor -// performance overhead. -template // The return type of the tasks what will be added to this - // task runner - class TaskRunner { -public: - // Add a task to the task runner what will also add the task to the global - // TaskPool. The - // function doesn't return the std::future for the task because it will be - // supplied by the - // WaitForNextCompletedTask after the task is completed. - template void AddTask(F &&f, Args &&... args); - - // Wait for the next task in this task runner to finish and then return the - // std::future what - // belongs to the finished task. If there is no task in this task runner - // (neither pending nor - // comleted) then this function will return an invalid future. Usually this - // function should be - // called in a loop processing the results of the tasks until it returns an - // invalid std::future - // what means that all task in this task runner is completed. - std::future WaitForNextCompletedTask(); - - // Convenience method to wait for all task in this TaskRunner to finish. Do - // NOT use this class - // just because of this method. Use TaskPool instead and wait for each - // std::future returned by - // AddTask in a loop. - void WaitForAllTasks(); - -private: - std::list> m_ready; - std::list> m_pending; - std::mutex m_mutex; - std::condition_variable m_cv; -}; - template std::future::type> TaskPool::AddTask(F &&f, Args &&... args) { @@ -126,64 +82,10 @@ template <> struct TaskPool::RunTaskImpl<> { static void Run() {} }; -template -template -void TaskRunner::AddTask(F &&f, Args &&... args) { - std::unique_lock lock(m_mutex); - auto it = m_pending.emplace(m_pending.end()); - *it = std::move(TaskPool::AddTask( - [this, it](F f, Args... args) { - T &&r = f(std::forward(args)...); - - std::unique_lock lock(this->m_mutex); - this->m_ready.splice(this->m_ready.end(), this->m_pending, it); - lock.unlock(); - - this->m_cv.notify_one(); - return r; - }, - std::forward(f), std::forward(args)...)); -} - -template <> -template -void TaskRunner::AddTask(F &&f, Args &&... args) { - std::unique_lock lock(m_mutex); - auto it = m_pending.emplace(m_pending.end()); - *it = std::move(TaskPool::AddTask( - [this, it](F f, Args... 
args) { - f(std::forward(args)...); - - std::unique_lock lock(this->m_mutex); - this->m_ready.emplace_back(std::move(*it)); - this->m_pending.erase(it); - lock.unlock(); - - this->m_cv.notify_one(); - }, - std::forward(f), std::forward(args)...)); -} - -template std::future TaskRunner::WaitForNextCompletedTask() { - std::unique_lock lock(m_mutex); - if (m_ready.empty() && m_pending.empty()) - return std::future(); // No more tasks - - if (m_ready.empty()) - m_cv.wait(lock, [this]() { return !this->m_ready.empty(); }); - - std::future res = std::move(m_ready.front()); - m_ready.pop_front(); - - lock.unlock(); - res.wait(); - - return std::move(res); -} - -template void TaskRunner::WaitForAllTasks() { - while (WaitForNextCompletedTask().valid()) - ; -} +// Run 'func' on every value from begin .. end-1. Each worker will grab +// 'batch_size' numbers at a time to work on, so for very fast functions, batch +// should be large enough to avoid too much cache line contention. +void TaskMapOverInt(size_t begin, size_t end, + std::function const &func); #endif // #ifndef utility_TaskPool_h_ diff --git a/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py b/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py index b1b5cbe677c4..aa369ebeff87 100644 --- a/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py +++ b/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py @@ -12,6 +12,7 @@ from lldbsuite.test import lldbutil class MultilineExpressionsTestCase(TestBase): mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True def setUp(self): # Call super's setUp(). @@ -60,3 +61,30 @@ class MultilineExpressionsTestCase(TestBase): child.expect_exact(prompt) self.expect(child.before, exe=False, patterns=['= 5']) + + @skipIfRemote + @expectedFailureAll( + oslist=["windows"], + bugnumber="llvm.org/pr22274: need a pexpect replacement for windows") + def test_empty_list(self): + """Test printing an empty list of expressions""" + import pexpect + prompt = "(lldb) " + + # So that the child gets torn down after the test + self.child = pexpect.spawn( + "%s %s" % + (lldbtest_config.lldbExec, self.lldbOption)) + child = self.child + + # Turn on logging for what the child sends back. + if self.TraceOn(): + child.logfile_read = sys.stdout + + # We expect a prompt, then send "print" to start a list of expressions, + # then an empty line. We expect a prompt back. 
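+ # ("print" with no arguments enters multiline expression mode; "1:" is
+ # the prompt for the first expression line.)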
+ child.expect_exact(prompt) + child.sendline("print") + child.expect_exact('1:') + child.sendline("") + child.expect_exact(prompt) diff --git a/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py b/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py index 00ddc628607c..4dfeae3f5e19 100644 --- a/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py +++ b/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py @@ -62,12 +62,11 @@ class StepOverBreakpointsTestCase(TestBase): instructions = function.GetInstructions(self.target) addr_1 = self.breakpoint1.GetLocationAtIndex(0).GetAddress() addr_4 = self.breakpoint4.GetLocationAtIndex(0).GetAddress() - for i in range(instructions.GetSize()) : - addr = instructions.GetInstructionAtIndex(i).GetAddress() - if (addr == addr_1) : index_1 = i - if (addr == addr_4) : index_4 = i - steps_expected = index_4 - index_1 + # if third argument is true then the count will be the number of + # instructions on which a breakpoint can be set. + # start = addr_1, end = addr_4, canSetBreakpoint = True + steps_expected = instructions.GetInstructionsCount(addr_1, addr_4, True) step_count = 0 # Step from breakpoint_1 to breakpoint_4 while True: diff --git a/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py b/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py index 778c098a38ee..90562f52a4b2 100644 --- a/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py +++ b/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py @@ -171,17 +171,45 @@ class ReturnValueTestCase(TestBase): #self.return_and_test_struct_value ("return_one_int_one_double_packed") self.return_and_test_struct_value("return_one_int_one_long") - # icc and gcc don't support this extension. 
- if self.getCompiler().endswith('clang'): - self.return_and_test_struct_value("return_vector_size_float32_8") - self.return_and_test_struct_value("return_vector_size_float32_16") - self.return_and_test_struct_value("return_vector_size_float32_32") - self.return_and_test_struct_value( - "return_ext_vector_size_float32_2") - self.return_and_test_struct_value( - "return_ext_vector_size_float32_4") - self.return_and_test_struct_value( - "return_ext_vector_size_float32_8") + @expectedFailureAll(oslist=["freebsd"], archs=["i386"]) + @expectedFailureAll(oslist=["macosx"], archs=["i386"], bugnumber="") + @expectedFailureAll( + oslist=["linux"], + compiler="clang", + compiler_version=[ + "<=", + "3.6"], + archs=["i386"]) + @expectedFailureAll( + bugnumber="llvm.org/pr25785", + hostoslist=["windows"], + compiler="gcc", + archs=["i386"], + triple='.*-android') + @expectedFailureAll(compiler=["gcc"], archs=["x86_64", "i386"]) + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24778") + def test_vector_values(self): + self.build() + exe = os.path.join(os.getcwd(), "a.out") + error = lldb.SBError() + + self.target = self.dbg.CreateTarget(exe) + self.assertTrue(self.target, VALID_TARGET) + + main_bktp = self.target.BreakpointCreateByName("main", exe) + self.assertTrue(main_bktp, VALID_BREAKPOINT) + + self.process = self.target.LaunchSimple( + None, None, self.get_process_working_directory()) + self.assertEqual(len(lldbutil.get_threads_stopped_at_breakpoint( + self.process, main_bktp)), 1) + + self.return_and_test_struct_value("return_vector_size_float32_8") + self.return_and_test_struct_value("return_vector_size_float32_16") + self.return_and_test_struct_value("return_vector_size_float32_32") + self.return_and_test_struct_value("return_ext_vector_size_float32_2") + self.return_and_test_struct_value("return_ext_vector_size_float32_4") + self.return_and_test_struct_value("return_ext_vector_size_float32_8") def return_and_test_struct_value(self, func_name): """Pass in the name of the function to return from - takes in value, returns value.""" diff --git a/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py b/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py index 5089ee85773f..d84511d54273 100644 --- a/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py +++ b/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py @@ -14,6 +14,7 @@ class TestGdbRemoteHostInfo(GdbRemoteTestCaseBase): mydir = TestBase.compute_mydir(__file__) KNOWN_HOST_INFO_KEYS = set([ + "arch", "cputype", "cpusubtype", "distribution_id", diff --git a/scripts/interface/SBInstruction.i b/scripts/interface/SBInstruction.i index d5b60201e95e..c78799c6fe69 100644 --- a/scripts/interface/SBInstruction.i +++ b/scripts/interface/SBInstruction.i @@ -54,6 +54,9 @@ public: bool HasDelaySlot (); + bool + CanSetBreakpoint (); + void Print (FILE *out); diff --git a/scripts/interface/SBInstructionList.i b/scripts/interface/SBInstructionList.i index 32603be5cc1e..f4b572c341cd 100644 --- a/scripts/interface/SBInstructionList.i +++ b/scripts/interface/SBInstructionList.i @@ -44,6 +44,9 @@ public: lldb::SBInstruction GetInstructionAtIndex (uint32_t idx); + size_t GetInstructionsCount(const SBAddress &start, const SBAddress &end, + bool canSetBreakpoint); + void Clear (); diff --git a/source/API/SBAddress.cpp b/source/API/SBAddress.cpp index b452ce327ab7..a3493d7c743f 100644 --- a/source/API/SBAddress.cpp +++ b/source/API/SBAddress.cpp @@ -55,6 +55,12 @@ const 
SBAddress &SBAddress::operator=(const SBAddress &rhs) { return *this; } +bool lldb::operator==(const SBAddress &lhs, const SBAddress &rhs) { + if (lhs.IsValid() && rhs.IsValid()) + return lhs.ref() == rhs.ref(); + return false; +} + bool SBAddress::IsValid() const { return m_opaque_ap.get() != NULL && m_opaque_ap->IsValid(); } diff --git a/source/API/SBInstruction.cpp b/source/API/SBInstruction.cpp index c47307c733a8..8b7deb7011be 100644 --- a/source/API/SBInstruction.cpp +++ b/source/API/SBInstruction.cpp @@ -176,6 +176,13 @@ bool SBInstruction::HasDelaySlot() { return false; } +bool SBInstruction::CanSetBreakpoint () { + lldb::InstructionSP inst_sp(GetOpaque()); + if (inst_sp) + return inst_sp->CanSetBreakpoint(); + return false; +} + lldb::InstructionSP SBInstruction::GetOpaque() { if (m_opaque_sp) return m_opaque_sp->GetSP(); diff --git a/source/API/SBInstructionList.cpp b/source/API/SBInstructionList.cpp index 04c37f50c2d7..3edb9eae98c1 100644 --- a/source/API/SBInstructionList.cpp +++ b/source/API/SBInstructionList.cpp @@ -9,6 +9,7 @@ #include "lldb/API/SBInstructionList.h" #include "lldb/API/SBInstruction.h" +#include "lldb/API/SBAddress.h" #include "lldb/API/SBStream.h" #include "lldb/Core/Disassembler.h" #include "lldb/Core/Module.h" @@ -49,6 +50,31 @@ SBInstruction SBInstructionList::GetInstructionAtIndex(uint32_t idx) { return inst; } +size_t SBInstructionList::GetInstructionsCount(const SBAddress &start, + const SBAddress &end, + bool canSetBreakpoint) { + size_t num_instructions = GetSize(); + size_t i = 0; + SBAddress addr; + size_t lower_index = 0; + size_t upper_index = 0; + size_t instructions_to_skip = 0; + for (i = 0; i < num_instructions; ++i) { + addr = GetInstructionAtIndex(i).GetAddress(); + if (start == addr) + lower_index = i; + if (end == addr) + upper_index = i; + } + if (canSetBreakpoint) + for (i = lower_index; i <= upper_index; ++i) { + SBInstruction insn = GetInstructionAtIndex(i); + if (!insn.CanSetBreakpoint()) + ++instructions_to_skip; + } + return upper_index - lower_index - instructions_to_skip; +} + void SBInstructionList::Clear() { m_opaque_sp.reset(); } void SBInstructionList::AppendInstruction(SBInstruction insn) {} diff --git a/source/API/SBProcess.cpp b/source/API/SBProcess.cpp index 5614cb468a69..0348113a9873 100644 --- a/source/API/SBProcess.cpp +++ b/source/API/SBProcess.cpp @@ -1157,22 +1157,34 @@ uint32_t SBProcess::LoadImage(lldb::SBFileSpec &sb_remote_image_spec, uint32_t SBProcess::LoadImage(const lldb::SBFileSpec &sb_local_image_spec, const lldb::SBFileSpec &sb_remote_image_spec, lldb::SBError &sb_error) { + Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_API)); ProcessSP process_sp(GetSP()); if (process_sp) { Process::StopLocker stop_locker; if (stop_locker.TryLock(&process_sp->GetRunLock())) { + if (log) + log->Printf("SBProcess(%p)::LoadImage() => calling Platform::LoadImage" + "for: %s", + static_cast(process_sp.get()), + sb_local_image_spec.GetFilename()); + std::lock_guard guard( - process_sp->GetTarget().GetAPIMutex()); + process_sp->GetTarget().GetAPIMutex()); PlatformSP platform_sp = process_sp->GetTarget().GetPlatform(); return platform_sp->LoadImage(process_sp.get(), *sb_local_image_spec, *sb_remote_image_spec, sb_error.ref()); } else { - Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_API)); if (log) log->Printf("SBProcess(%p)::LoadImage() => error: process is running", static_cast(process_sp.get())); sb_error.SetErrorString("process is running"); } + } else { + if (log) + 
log->Printf("SBProcess(%p)::LoadImage() => error: called with invalid" + " process", + static_cast(process_sp.get())); + sb_error.SetErrorString("process is invalid"); } return LLDB_INVALID_IMAGE_TOKEN; } diff --git a/source/Core/Disassembler.cpp b/source/Core/Disassembler.cpp index 3880bfd16ecc..51d93d9acdbb 100644 --- a/source/Core/Disassembler.cpp +++ b/source/Core/Disassembler.cpp @@ -759,6 +759,10 @@ bool Instruction::DumpEmulation(const ArchSpec &arch) { return false; } +bool Instruction::CanSetBreakpoint () { + return !HasDelaySlot(); +} + bool Instruction::HasDelaySlot() { // Default is false. return false; diff --git a/source/Host/common/Editline.cpp b/source/Host/common/Editline.cpp index b157cdb7c110..851287e76331 100644 --- a/source/Host/common/Editline.cpp +++ b/source/Host/common/Editline.cpp @@ -367,7 +367,7 @@ void Editline::MoveCursor(CursorLocation from, CursorLocation to) { if (to == CursorLocation::EditingCursor) { toColumn = editline_cursor_position - (editline_cursor_row * m_terminal_width) + 1; - } else if (to == CursorLocation::BlockEnd) { + } else if (to == CursorLocation::BlockEnd && !m_input_lines.empty()) { toColumn = ((m_input_lines[m_input_lines.size() - 1].length() + GetPromptWidth()) % 80) + diff --git a/source/Host/common/MainLoop.cpp b/source/Host/common/MainLoop.cpp index 8a9d4f020d5f..abd52f7f46fb 100644 --- a/source/Host/common/MainLoop.cpp +++ b/source/Host/common/MainLoop.cpp @@ -18,6 +18,11 @@ #include #include +// Multiplexing is implemented using kqueue on systems that support it (BSD +// variants including OSX). On linux we use ppoll, while android uses pselect +// (ppoll is present but not implemented properly). On windows we use WSApoll +// (which does not support signals). + #if HAVE_SYS_EVENT_H #include #elif defined(LLVM_ON_WIN32) @@ -65,92 +70,72 @@ static void SignalHandler(int signo, siginfo_t *info, void *) { class MainLoop::RunImpl { public: - // TODO: Use llvm::Expected - static std::unique_ptr Create(MainLoop &loop, Error &error); - ~RunImpl(); + RunImpl(MainLoop &loop); + ~RunImpl() = default; Error Poll(); - - template void ForEachReadFD(F &&f); - template void ForEachSignal(F &&f); + void ProcessEvents(); private: MainLoop &loop; #if HAVE_SYS_EVENT_H - int queue_id; std::vector in_events; struct kevent out_events[4]; int num_events = -1; - RunImpl(MainLoop &loop, int queue_id) : loop(loop), queue_id(queue_id) { - in_events.reserve(loop.m_read_fds.size() + loop.m_signals.size()); - } #else - std::vector signals; #ifdef FORCE_PSELECT fd_set read_fd_set; #else std::vector read_fds; #endif - RunImpl(MainLoop &loop) : loop(loop) { - signals.reserve(loop.m_signals.size()); - } - sigset_t get_sigmask(); #endif }; #if HAVE_SYS_EVENT_H -MainLoop::RunImpl::~RunImpl() { - int r = close(queue_id); - assert(r == 0); - (void)r; -} -std::unique_ptr MainLoop::RunImpl::Create(MainLoop &loop, Error &error) -{ - error.Clear(); - int queue_id = kqueue(); - if(queue_id < 0) { - error = Error(errno, eErrorTypePOSIX); - return nullptr; - } - return std::unique_ptr(new RunImpl(loop, queue_id)); +MainLoop::RunImpl::RunImpl(MainLoop &loop) : loop(loop) { + in_events.reserve(loop.m_read_fds.size()); } Error MainLoop::RunImpl::Poll() { - in_events.resize(loop.m_read_fds.size() + loop.m_signals.size()); + in_events.resize(loop.m_read_fds.size()); unsigned i = 0; for (auto &fd : loop.m_read_fds) EV_SET(&in_events[i++], fd.first, EVFILT_READ, EV_ADD, 0, 0, 0); - for (const auto &sig : loop.m_signals) - EV_SET(&in_events[i++], sig.first, EVFILT_SIGNAL, EV_ADD, 0, 
0, 0); - - num_events = kevent(queue_id, in_events.data(), in_events.size(), out_events, - llvm::array_lengthof(out_events), nullptr); + num_events = kevent(loop.m_kqueue, in_events.data(), in_events.size(), + out_events, llvm::array_lengthof(out_events), nullptr); if (num_events < 0) return Error("kevent() failed with error %d\n", num_events); return Error(); } -template void MainLoop::RunImpl::ForEachReadFD(F &&f) { +void MainLoop::RunImpl::ProcessEvents() { assert(num_events >= 0); for (int i = 0; i < num_events; ++i) { - f(out_events[i].ident); if (loop.m_terminate_request) return; + switch (out_events[i].filter) { + case EVFILT_READ: + loop.ProcessReadObject(out_events[i].ident); + break; + case EVFILT_SIGNAL: + loop.ProcessSignal(out_events[i].ident); + break; + default: + llvm_unreachable("Unknown event"); + } } } -template void MainLoop::RunImpl::ForEachSignal(F && f) {} #else -MainLoop::RunImpl::~RunImpl() {} -std::unique_ptr MainLoop::RunImpl::Create(MainLoop &loop, Error &error) -{ - error.Clear(); - return std::unique_ptr(new RunImpl(loop)); +MainLoop::RunImpl::RunImpl(MainLoop &loop) : loop(loop) { +#ifndef FORCE_PSELECT + read_fds.reserve(loop.m_read_fds.size()); +#endif } sigset_t MainLoop::RunImpl::get_sigmask() { @@ -162,18 +147,14 @@ sigset_t MainLoop::RunImpl::get_sigmask() { assert(ret == 0); (void) ret; - for (const auto &sig : loop.m_signals) { - signals.push_back(sig.first); + for (const auto &sig : loop.m_signals) sigdelset(&sigmask, sig.first); - } return sigmask; #endif } #ifdef FORCE_PSELECT Error MainLoop::RunImpl::Poll() { - signals.clear(); - FD_ZERO(&read_fd_set); int nfds = 0; for (const auto &fd : loop.m_read_fds) { @@ -188,20 +169,8 @@ Error MainLoop::RunImpl::Poll() { return Error(); } - -template void MainLoop::RunImpl::ForEachReadFD(F &&f) { - for (const auto &fd : loop.m_read_fds) { - if(!FD_ISSET(fd.first, &read_fd_set)) - continue; - - f(fd.first); - if (loop.m_terminate_request) - return; - } -} #else Error MainLoop::RunImpl::Poll() { - signals.clear(); read_fds.clear(); sigset_t sigmask = get_sigmask(); @@ -220,33 +189,47 @@ Error MainLoop::RunImpl::Poll() { return Error(); } +#endif -template void MainLoop::RunImpl::ForEachReadFD(F &&f) { +void MainLoop::RunImpl::ProcessEvents() { +#ifdef FORCE_PSELECT + for (const auto &fd : loop.m_read_fds) { + if (!FD_ISSET(fd.first, &read_fd_set)) + continue; + IOObject::WaitableHandle handle = fd.first; +#else for (const auto &fd : read_fds) { if ((fd.revents & POLLIN) == 0) continue; - - f(fd.fd); + IOObject::WaitableHandle handle = fd.fd; +#endif if (loop.m_terminate_request) return; - } -} -#endif -template void MainLoop::RunImpl::ForEachSignal(F &&f) { - for (int sig : signals) { - if (g_signal_flags[sig] == 0) - continue; // No signal - g_signal_flags[sig] = 0; - f(sig); + loop.ProcessReadObject(handle); + } + for (const auto &entry : loop.m_signals) { if (loop.m_terminate_request) return; + if (g_signal_flags[entry.first] == 0) + continue; // No signal + g_signal_flags[entry.first] = 0; + loop.ProcessSignal(entry.first); } } #endif +MainLoop::MainLoop() { +#if HAVE_SYS_EVENT_H + m_kqueue = kqueue(); + assert(m_kqueue >= 0); +#endif +} MainLoop::~MainLoop() { +#if HAVE_SYS_EVENT_H + close(m_kqueue); +#endif assert(m_read_fds.size() == 0); assert(m_signals.size() == 0); } @@ -298,24 +281,30 @@ MainLoop::RegisterSignal(int signo, const Callback &callback, new_action.sa_flags = SA_SIGINFO; sigemptyset(&new_action.sa_mask); sigaddset(&new_action.sa_mask, signo); - sigset_t old_set; - if (int ret = 
pthread_sigmask(SIG_BLOCK, &new_action.sa_mask, &old_set)) { - error.SetErrorStringWithFormat("pthread_sigmask failed with error %d\n", - ret); - return nullptr; - } - info.was_blocked = sigismember(&old_set, signo); - if (sigaction(signo, &new_action, &info.old_action) == -1) { - error.SetErrorToErrno(); - if (!info.was_blocked) - pthread_sigmask(SIG_UNBLOCK, &new_action.sa_mask, nullptr); - return nullptr; - } + g_signal_flags[signo] = 0; + + // Even if using kqueue, the signal handler will still be invoked, so it's + // important to replace it with our "benign" handler. + int ret = sigaction(signo, &new_action, &info.old_action); + assert(ret == 0 && "sigaction failed"); +#if HAVE_SYS_EVENT_H + struct kevent ev; + EV_SET(&ev, signo, EVFILT_SIGNAL, EV_ADD, 0, 0, 0); + ret = kevent(m_kqueue, &ev, 1, nullptr, 0, nullptr); + assert(ret == 0); +#endif + + // If we're using kqueue, the signal needs to be unblocked in order to receive + // it. If using pselect/ppoll, we need to block it, and later unblock it as a + // part of the system call. + ret = pthread_sigmask(HAVE_SYS_EVENT_H ? SIG_UNBLOCK : SIG_BLOCK, + &new_action.sa_mask, &old_set); + assert(ret == 0 && "pthread_sigmask failed"); + info.was_blocked = sigismember(&old_set, signo); m_signals.insert({signo, info}); - g_signal_flags[signo] = 0; return SignalHandleUP(new SignalHandle(*this, signo)); #endif @@ -331,7 +320,6 @@ void MainLoop::UnregisterSignal(int signo) { #if SIGNAL_POLLING_UNSUPPORTED Error("Signal polling is not supported on this platform."); #else - // We undo the actions of RegisterSignal on a best-effort basis. auto it = m_signals.find(signo); assert(it != m_signals.end()); sigset_t set; sigemptyset(&set); sigaddset(&set, signo); - pthread_sigmask(it->second.was_blocked ? SIG_BLOCK : SIG_UNBLOCK, &set, - nullptr); + int ret = pthread_sigmask(it->second.was_blocked ? 
SIG_BLOCK : SIG_UNBLOCK, + &set, nullptr); + assert(ret == 0); + (void)ret; + +#if HAVE_SYS_EVENT_H + struct kevent ev; + EV_SET(&ev, signo, EVFILT_SIGNAL, EV_DELETE, 0, 0, 0); + ret = kevent(m_kqueue, &ev, 1, nullptr, 0, nullptr); + assert(ret == 0); +#endif m_signals.erase(it); #endif @@ -351,32 +348,31 @@ Error MainLoop::Run() { m_terminate_request = false; Error error; - auto impl = RunImpl::Create(*this, error); - if (!impl) - return error; + RunImpl impl(*this); // run until termination or until we run out of things to listen to while (!m_terminate_request && (!m_read_fds.empty() || !m_signals.empty())) { - error = impl->Poll(); + error = impl.Poll(); if (error.Fail()) return error; - impl->ForEachSignal([&](int sig) { - auto it = m_signals.find(sig); - if (it != m_signals.end()) - it->second.callback(*this); // Do the work - }); - if (m_terminate_request) - return Error(); + impl.ProcessEvents(); - impl->ForEachReadFD([&](int fd) { - auto it = m_read_fds.find(fd); - if (it != m_read_fds.end()) - it->second(*this); // Do the work - }); if (m_terminate_request) return Error(); } return Error(); } + +void MainLoop::ProcessSignal(int signo) { + auto it = m_signals.find(signo); + if (it != m_signals.end()) + it->second.callback(*this); // Do the work +} + +void MainLoop::ProcessReadObject(IOObject::WaitableHandle handle) { + auto it = m_read_fds.find(handle); + if (it != m_read_fds.end()) + it->second(*this); // Do the work +} diff --git a/source/Host/common/UDPSocket.cpp b/source/Host/common/UDPSocket.cpp index a32657aab0a6..ce8d90891b2b 100644 --- a/source/Host/common/UDPSocket.cpp +++ b/source/Host/common/UDPSocket.cpp @@ -28,31 +28,41 @@ const int kDomain = AF_INET; const int kType = SOCK_DGRAM; static const char *g_not_supported_error = "Not supported"; -} // namespace - -UDPSocket::UDPSocket(bool should_close, bool child_processes_inherit) - : Socket(ProtocolUdp, should_close, child_processes_inherit) {} +} -UDPSocket::UDPSocket(NativeSocket socket, const UDPSocket &listen_socket) - : Socket(ProtocolUdp, listen_socket.m_should_close_fd, - listen_socket.m_child_processes_inherit) { +UDPSocket::UDPSocket(NativeSocket socket) : Socket(ProtocolUdp, true, true) { m_socket = socket; } +UDPSocket::UDPSocket(bool should_close, bool child_processes_inherit) + : Socket(ProtocolUdp, should_close, child_processes_inherit) {} + size_t UDPSocket::Send(const void *buf, const size_t num_bytes) { return ::sendto(m_socket, static_cast(buf), num_bytes, 0, m_sockaddr, m_sockaddr.GetLength()); } Error UDPSocket::Connect(llvm::StringRef name) { + return Error("%s", g_not_supported_error); +} + +Error UDPSocket::Listen(llvm::StringRef name, int backlog) { + return Error("%s", g_not_supported_error); +} + +Error UDPSocket::Accept(Socket *&socket) { + return Error("%s", g_not_supported_error); +} + +Error UDPSocket::Connect(llvm::StringRef name, bool child_processes_inherit, + Socket *&socket) { + std::unique_ptr final_socket; + Log *log(lldb_private::GetLogIfAnyCategoriesSet(LIBLLDB_LOG_CONNECTION)); if (log) log->Printf("UDPSocket::%s (host/port = %s)", __FUNCTION__, name.data()); Error error; - if (error.Fail()) - return error; - std::string host_str; std::string port_str; int32_t port = INT32_MIN; @@ -84,11 +94,12 @@ Error UDPSocket::Connect(llvm::StringRef name) { for (struct addrinfo *service_info_ptr = service_info_list; service_info_ptr != nullptr; service_info_ptr = service_info_ptr->ai_next) { - m_socket = Socket::CreateSocket( + auto send_fd = CreateSocket( service_info_ptr->ai_family, 
service_info_ptr->ai_socktype, - service_info_ptr->ai_protocol, m_child_processes_inherit, error); + service_info_ptr->ai_protocol, child_processes_inherit, error); if (error.Success()) { - m_sockaddr = service_info_ptr; + final_socket.reset(new UDPSocket(send_fd)); + final_socket->m_sockaddr = service_info_ptr; break; } else continue; @@ -96,17 +107,16 @@ Error UDPSocket::Connect(llvm::StringRef name) { ::freeaddrinfo(service_info_list); - if (IsValid()) + if (!final_socket) return error; SocketAddress bind_addr; // Only bind to the loopback address if we are expecting a connection from // localhost to avoid any firewall issues. - const bool bind_addr_success = - (host_str == "127.0.0.1" || host_str == "localhost") - ? bind_addr.SetToLocalhost(kDomain, port) - : bind_addr.SetToAnyAddress(kDomain, port); + const bool bind_addr_success = (host_str == "127.0.0.1" || host_str == "localhost") + ? bind_addr.SetToLocalhost(kDomain, port) + : bind_addr.SetToAnyAddress(kDomain, port); if (!bind_addr_success) { error.SetErrorString("Failed to get hostspec to bind for"); @@ -115,37 +125,13 @@ Error UDPSocket::Connect(llvm::StringRef name) { bind_addr.SetPort(0); // Let the source port # be determined dynamically - err = ::bind(m_socket, bind_addr, bind_addr.GetLength()); + err = ::bind(final_socket->GetNativeSocket(), bind_addr, bind_addr.GetLength()); - error.Clear(); - return error; -} + struct sockaddr_in source_info; + socklen_t address_len = sizeof (struct sockaddr_in); + err = ::getsockname(final_socket->GetNativeSocket(), (struct sockaddr *) &source_info, &address_len); -Error UDPSocket::Listen(llvm::StringRef name, int backlog) { - return Error("%s", g_not_supported_error); -} - -Error UDPSocket::Accept(Socket *&socket) { - return Error("%s", g_not_supported_error); -} - -Error UDPSocket::CreateSocket() { - Error error; - if (IsValid()) - error = Close(); - if (error.Fail()) - return error; - m_socket = - Socket::CreateSocket(kDomain, kType, 0, m_child_processes_inherit, error); - return error; -} - -Error UDPSocket::Connect(llvm::StringRef name, bool child_processes_inherit, - Socket *&socket) { - std::unique_ptr final_socket( - new UDPSocket(true, child_processes_inherit)); - Error error = final_socket->Connect(name); - if (!error.Fail()) - socket = final_socket.release(); + socket = final_socket.release(); + error.Clear(); return error; } diff --git a/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp b/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp index 04df0065d7bc..65cbd271e979 100644 --- a/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp +++ b/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp @@ -2362,32 +2362,30 @@ ValueObjectSP ABISysV_arm64::GetReturnValueObjectImpl( if (success) return_valobj_sp = ValueObjectConstResult::Create( thread.GetStackFrameAtIndex(0).get(), value, ConstString("")); - } else if (type_flags & eTypeIsVector) { + } else if (type_flags & eTypeIsVector && byte_size <= 16) { if (byte_size > 0) { const RegisterInfo *v0_info = reg_ctx->GetRegisterInfoByName("v0", 0); if (v0_info) { - if (byte_size <= v0_info->byte_size) { - std::unique_ptr heap_data_ap( - new DataBufferHeap(byte_size, 0)); - const ByteOrder byte_order = exe_ctx.GetProcessRef().GetByteOrder(); - RegisterValue reg_value; - if (reg_ctx->ReadRegister(v0_info, reg_value)) { - Error error; - if (reg_value.GetAsMemoryData(v0_info, heap_data_ap->GetBytes(), - heap_data_ap->GetByteSize(), - byte_order, error)) { - DataExtractor data(DataBufferSP(heap_data_ap.release()), - byte_order, - 
exe_ctx.GetProcessRef().GetAddressByteSize()); - return_valobj_sp = ValueObjectConstResult::Create( - &thread, return_compiler_type, ConstString(""), data); - } + std::unique_ptr heap_data_ap( + new DataBufferHeap(byte_size, 0)); + const ByteOrder byte_order = exe_ctx.GetProcessRef().GetByteOrder(); + RegisterValue reg_value; + if (reg_ctx->ReadRegister(v0_info, reg_value)) { + Error error; + if (reg_value.GetAsMemoryData(v0_info, heap_data_ap->GetBytes(), + heap_data_ap->GetByteSize(), byte_order, + error)) { + DataExtractor data(DataBufferSP(heap_data_ap.release()), byte_order, + exe_ctx.GetProcessRef().GetAddressByteSize()); + return_valobj_sp = ValueObjectConstResult::Create( + &thread, return_compiler_type, ConstString(""), data); } } } } - } else if (type_flags & eTypeIsStructUnion || type_flags & eTypeIsClass) { + } else if (type_flags & eTypeIsStructUnion || type_flags & eTypeIsClass || + (type_flags & eTypeIsVector && byte_size > 16)) { DataExtractor data; uint32_t NGRN = 0; // Search ABI docs for NGRN diff --git a/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp b/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp index 20260ee5b5c3..c824653b2e93 100644 --- a/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp +++ b/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp @@ -434,24 +434,25 @@ Error DynamicLoaderMacOS::CanLoadImage() { // Default assumption is that it is OK to load images. // Only say that we cannot load images if we find the symbol in libdyld and it - // indicates that - // we cannot. + // indicates that we cannot. if (symbol_address != LLDB_INVALID_ADDRESS) { { int lock_held = m_process->ReadUnsignedIntegerFromMemory(symbol_address, 4, 0, error); if (lock_held != 0) { - error.SetErrorToGenericError(); + error.SetErrorString("dyld lock held - unsafe to load images."); } } } else { // If we were unable to find _dyld_global_lock_held in any modules, or it is - // not loaded into - // memory yet, we may be at process startup (sitting at _dyld_start) - so we - // should not allow - // dlopen calls. - error.SetErrorToGenericError(); + // not loaded into memory yet, we may be at process startup (sitting + // at _dyld_start) - so we should not allow dlopen calls. + // But if we found more than one module then we are clearly past _dyld_start + // so in that case we'll default to "it's safe". + if (num_modules <= 1) + error.SetErrorString("could not find the dyld library or " + "the dyld lock symbol"); } return error; } diff --git a/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 8c2fc3d3aa42..ad6af8dfebd5 100644 --- a/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -1946,7 +1946,9 @@ void SymbolFileDWARF::Index() { std::vector type_index(num_compile_units); std::vector namespace_index(num_compile_units); - std::vector clear_cu_dies(num_compile_units, false); + // std::vector might be implemented using bit test-and-set, so use + // uint8_t instead. 
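+ // (std::vector<bool> packs its elements into single bits, so the parallel
+ // extract tasks below could otherwise race while writing neighboring
+ // elements that share a byte.)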
+ std::vector clear_cu_dies(num_compile_units, false); auto parser_fn = [debug_info, &function_basename_index, &function_fullname_index, &function_method_index, &function_selector_index, &objc_class_selectors_index, @@ -1963,22 +1965,18 @@ void SymbolFileDWARF::Index() { return cu_idx; }; - auto extract_fn = [debug_info](uint32_t cu_idx) { + auto extract_fn = [debug_info, &clear_cu_dies](uint32_t cu_idx) { DWARFCompileUnit *dwarf_cu = debug_info->GetCompileUnitAtIndex(cu_idx); if (dwarf_cu) { // dwarf_cu->ExtractDIEsIfNeeded(false) will return zero if the // DIEs for a compile unit have already been parsed. - return std::make_pair(cu_idx, dwarf_cu->ExtractDIEsIfNeeded(false) > 1); + if (dwarf_cu->ExtractDIEsIfNeeded(false) > 1) + clear_cu_dies[cu_idx] = true; } - return std::make_pair(cu_idx, false); }; // Create a task runner that extracts dies for each DWARF compile unit in a // separate thread - TaskRunner> task_runner_extract; - for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) - task_runner_extract.AddTask(extract_fn, cu_idx); - //---------------------------------------------------------------------- // First figure out which compile units didn't have their DIEs already // parsed and remember this. If no DIEs were parsed prior to this index @@ -1988,48 +1986,37 @@ void SymbolFileDWARF::Index() { // a DIE in one compile unit refers to another and the indexes accesses // those DIEs. //---------------------------------------------------------------------- - while (true) { - auto f = task_runner_extract.WaitForNextCompletedTask(); - if (!f.valid()) - break; - unsigned cu_idx; - bool clear; - std::tie(cu_idx, clear) = f.get(); - clear_cu_dies[cu_idx] = clear; - } + TaskMapOverInt(0, num_compile_units, extract_fn); // Now create a task runner that can index each DWARF compile unit in a // separate // thread so we can index quickly. 
- TaskRunner task_runner; - for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) - task_runner.AddTask(parser_fn, cu_idx); + TaskMapOverInt(0, num_compile_units, parser_fn); - while (true) { - std::future f = task_runner.WaitForNextCompletedTask(); - if (!f.valid()) - break; - uint32_t cu_idx = f.get(); - - m_function_basename_index.Append(function_basename_index[cu_idx]); - m_function_fullname_index.Append(function_fullname_index[cu_idx]); - m_function_method_index.Append(function_method_index[cu_idx]); - m_function_selector_index.Append(function_selector_index[cu_idx]); - m_objc_class_selectors_index.Append(objc_class_selectors_index[cu_idx]); - m_global_index.Append(global_index[cu_idx]); - m_type_index.Append(type_index[cu_idx]); - m_namespace_index.Append(namespace_index[cu_idx]); - } + auto finalize_fn = [](NameToDIE &index, std::vector &srcs) { + for (auto &src : srcs) + index.Append(src); + index.Finalize(); + }; - TaskPool::RunTasks([&]() { m_function_basename_index.Finalize(); }, - [&]() { m_function_fullname_index.Finalize(); }, - [&]() { m_function_method_index.Finalize(); }, - [&]() { m_function_selector_index.Finalize(); }, - [&]() { m_objc_class_selectors_index.Finalize(); }, - [&]() { m_global_index.Finalize(); }, - [&]() { m_type_index.Finalize(); }, - [&]() { m_namespace_index.Finalize(); }); + TaskPool::RunTasks( + [&]() { + finalize_fn(m_function_basename_index, function_basename_index); + }, + [&]() { + finalize_fn(m_function_fullname_index, function_fullname_index); + }, + [&]() { finalize_fn(m_function_method_index, function_method_index); }, + [&]() { + finalize_fn(m_function_selector_index, function_selector_index); + }, + [&]() { + finalize_fn(m_objc_class_selectors_index, objc_class_selectors_index); + }, + [&]() { finalize_fn(m_global_index, global_index); }, + [&]() { finalize_fn(m_type_index, type_index); }, + [&]() { finalize_fn(m_namespace_index, namespace_index); }); //---------------------------------------------------------------------- // Keep memory down by clearing DIEs for any compile units if indexing diff --git a/source/Target/ThreadPlanCallUserExpression.cpp b/source/Target/ThreadPlanCallUserExpression.cpp index 679040d09a02..15cbd0baa9a6 100644 --- a/source/Target/ThreadPlanCallUserExpression.cpp +++ b/source/Target/ThreadPlanCallUserExpression.cpp @@ -60,6 +60,12 @@ void ThreadPlanCallUserExpression::GetDescription( ThreadPlanCallFunction::GetDescription(s, level); } +void ThreadPlanCallUserExpression::DidPush() { + ThreadPlanCallFunction::DidPush(); + if (m_user_expression_sp) + m_user_expression_sp->WillStartExecuting(); +} + void ThreadPlanCallUserExpression::WillPop() { ThreadPlanCallFunction::WillPop(); if (m_user_expression_sp) @@ -113,3 +119,8 @@ StopInfoSP ThreadPlanCallUserExpression::GetRealStopInfo() { return stop_info_sp; } + +void ThreadPlanCallUserExpression::DoTakedown(bool success) { + ThreadPlanCallFunction::DoTakedown(success); + m_user_expression_sp->DidFinishExecuting(); +} diff --git a/source/Utility/TaskPool.cpp b/source/Utility/TaskPool.cpp index 244e64fdb5fb..d8306dc7dc8f 100644 --- a/source/Utility/TaskPool.cpp +++ b/source/Utility/TaskPool.cpp @@ -73,3 +73,26 @@ void TaskPoolImpl::Worker(TaskPoolImpl *pool) { f(); } } + +void TaskMapOverInt(size_t begin, size_t end, + std::function const &func) { + std::atomic idx{begin}; + size_t num_workers = + std::min(end, std::thread::hardware_concurrency()); + + auto wrapper = [&idx, end, &func]() { + while (true) { + size_t i = idx.fetch_add(1); + if (i >= end) + break; 
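+ // The atomic fetch_add above hands each index in [begin, end) to exactly
+ // one worker, so no two workers ever run func on the same value.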
+ func(i); + } + }; + + std::vector> futures; + futures.reserve(num_workers); + for (size_t i = 0; i < num_workers; i++) + futures.push_back(TaskPool::AddTask(wrapper)); + for (size_t i = 0; i < num_workers; i++) + futures[i].wait(); +} diff --git a/unittests/Host/CMakeLists.txt b/unittests/Host/CMakeLists.txt index 3396f45da4f3..7b2ce3bbfde5 100644 --- a/unittests/Host/CMakeLists.txt +++ b/unittests/Host/CMakeLists.txt @@ -1,6 +1,7 @@ set (FILES FileSpecTest.cpp FileSystemTest.cpp + MainLoopTest.cpp SocketAddressTest.cpp SocketTest.cpp SymbolsTest.cpp diff --git a/unittests/Host/MainLoopTest.cpp b/unittests/Host/MainLoopTest.cpp new file mode 100644 index 000000000000..a2a673d38ca5 --- /dev/null +++ b/unittests/Host/MainLoopTest.cpp @@ -0,0 +1,120 @@ +//===-- MainLoopTest.cpp ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/MainLoop.h" +#include "lldb/Host/common/TCPSocket.h" +#include "gtest/gtest.h" +#include + +using namespace lldb_private; + +namespace { +class MainLoopTest : public testing::Test { +public: + static void SetUpTestCase() { +#ifdef _MSC_VER + WSADATA data; + ASSERT_EQ(0, WSAStartup(MAKEWORD(2, 2), &data)); +#endif + } + + static void TearDownTestCase() { +#ifdef _MSC_VER + ASSERT_EQ(0, WSACleanup()); +#endif + } + + void SetUp() override { + bool child_processes_inherit = false; + Error error; + std::unique_ptr listen_socket_up( + new TCPSocket(true, child_processes_inherit)); + ASSERT_TRUE(error.Success()); + error = listen_socket_up->Listen("localhost:0", 5); + ASSERT_TRUE(error.Success()); + + Socket *accept_socket; + std::future accept_error = std::async(std::launch::async, [&] { + return listen_socket_up->Accept(accept_socket); + }); + + std::unique_ptr connect_socket_up( + new TCPSocket(true, child_processes_inherit)); + error = connect_socket_up->Connect( + llvm::formatv("localhost:{0}", listen_socket_up->GetLocalPortNumber()) + .str()); + ASSERT_TRUE(error.Success()); + ASSERT_TRUE(accept_error.get().Success()); + + callback_count = 0; + socketpair[0] = std::move(connect_socket_up); + socketpair[1].reset(accept_socket); + } + + void TearDown() override { + socketpair[0].reset(); + socketpair[1].reset(); + } + +protected: + MainLoop::Callback make_callback() { + return [&](MainLoopBase &loop) { + ++callback_count; + loop.RequestTermination(); + }; + } + std::shared_ptr socketpair[2]; + unsigned callback_count; +}; +} // namespace + +TEST_F(MainLoopTest, ReadObject) { + char X = 'X'; + size_t len = sizeof(X); + ASSERT_TRUE(socketpair[0]->Write(&X, len).Success()); + + MainLoop loop; + + Error error; + auto handle = loop.RegisterReadObject(socketpair[1], make_callback(), error); + ASSERT_TRUE(error.Success()); + ASSERT_TRUE(handle); + ASSERT_TRUE(loop.Run().Success()); + ASSERT_EQ(1u, callback_count); +} + +TEST_F(MainLoopTest, TerminatesImmediately) { + char X = 'X'; + size_t len = sizeof(X); + ASSERT_TRUE(socketpair[0]->Write(&X, len).Success()); + ASSERT_TRUE(socketpair[1]->Write(&X, len).Success()); + + MainLoop loop; + Error error; + auto handle0 = loop.RegisterReadObject(socketpair[0], make_callback(), error); + ASSERT_TRUE(error.Success()); + auto handle1 = loop.RegisterReadObject(socketpair[1], make_callback(), error); + ASSERT_TRUE(error.Success()); + + 
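// Both sockets have data waiting, but the first callback requests
+ // termination, so the loop exits after servicing only one of them.
+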
ASSERT_TRUE(loop.Run().Success()); + ASSERT_EQ(1u, callback_count); +} + +#ifdef LLVM_ON_UNIX +TEST_F(MainLoopTest, Signal) { + MainLoop loop; + Error error; + + auto handle = loop.RegisterSignal(SIGUSR1, make_callback(), error); + ASSERT_TRUE(error.Success()); + kill(getpid(), SIGUSR1); + ASSERT_TRUE(loop.Run().Success()); + ASSERT_EQ(1u, callback_count); +} +#endif diff --git a/unittests/Utility/TaskPoolTest.cpp b/unittests/Utility/TaskPoolTest.cpp index 172e32a9c6c0..e340a81b27db 100644 --- a/unittests/Utility/TaskPoolTest.cpp +++ b/unittests/Utility/TaskPoolTest.cpp @@ -30,25 +30,14 @@ TEST(TaskPoolTest, RunTasks) { ASSERT_EQ(17, r[3]); } -TEST(TaskPoolTest, TaskRunner) { - auto fn = [](int x) { return std::make_pair(x, x * x); }; - - TaskRunner> tr; - tr.AddTask(fn, 1); - tr.AddTask(fn, 2); - tr.AddTask(fn, 3); - tr.AddTask(fn, 4); - - int count = 0; - while (true) { - auto f = tr.WaitForNextCompletedTask(); - if (!f.valid()) - break; - - ++count; - std::pair v = f.get(); - ASSERT_EQ(v.first * v.first, v.second); - } - - ASSERT_EQ(4, count); +TEST(TaskPoolTest, TaskMap) { + int data[4]; + auto fn = [&data](int x) { data[x] = x * x; }; + + TaskMapOverInt(0, 4, fn); + + ASSERT_EQ(data[0], 0); + ASSERT_EQ(data[1], 1); + ASSERT_EQ(data[2], 4); + ASSERT_EQ(data[3], 9); } diff --git a/www/lldb-gdb.html b/www/lldb-gdb.html index 60cd718d5255..69179bd8c07c 100755 --- a/www/lldb-gdb.html +++ b/www/lldb-gdb.html @@ -772,6 +772,27 @@ + List the threads in your program. + + + (gdb) info threads
+ (lldb) thread list
+
+ Select thread 1 as the default thread for subsequent commands.
+ (gdb) thread 1
+ (lldb) thread select 1
+ (lldb) t 1
+
+ Show the stack backtrace for the current thread.
@@ -1250,6 +1271,16 @@ LLDB
+ Search command help for a keyword.
+ (gdb) apropos keyword
+ (lldb) apropos keyword
+ + + Echo text to the screen. -- cgit v1.2.3 From 6b3f41ed88e8e440e11a4fbf20b6600529f80049 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Tue, 16 May 2017 19:46:52 +0000 Subject: Vendor import of llvm trunk r303197: https://llvm.org/svn/llvm-project/llvm/trunk@303197 --- CREDITS.TXT | 2 +- cmake/config-ix.cmake | 10 - cmake/modules/AddSphinxTarget.cmake | 13 + docs/CMakeLists.txt | 2 +- docs/GettingStarted.rst | 4 +- docs/LangRef.rst | 348 +- docs/Lexicon.rst | 8 + docs/LibFuzzer.rst | 19 +- docs/ReleaseNotes.rst | 4 + include/llvm/ADT/APInt.h | 49 +- include/llvm/ADT/BitVector.h | 2 +- include/llvm/ADT/STLExtras.h | 14 +- include/llvm/ADT/StringExtras.h | 7 + include/llvm/Analysis/CallGraph.h | 10 - include/llvm/Analysis/ProfileSummaryInfo.h | 4 +- include/llvm/Analysis/ScalarEvolution.h | 21 +- include/llvm/Analysis/TargetLibraryInfo.def | 127 +- include/llvm/Analysis/TargetTransformInfo.h | 33 + include/llvm/Analysis/TargetTransformInfoImpl.h | 12 + include/llvm/Analysis/ValueTracking.h | 13 +- include/llvm/Bitcode/BitcodeReader.h | 5 +- include/llvm/CodeGen/ExpandReductions.h | 24 + include/llvm/CodeGen/GlobalISel/LegalizerInfo.h | 17 +- include/llvm/CodeGen/GlobalISel/Utils.h | 3 + include/llvm/CodeGen/ISDOpcodes.h | 21 + include/llvm/CodeGen/MachineCombinerPattern.h | 2 + include/llvm/CodeGen/Passes.h | 12 + include/llvm/CodeGen/SelectionDAG.h | 14 +- include/llvm/DebugInfo/CodeView/CVTypeVisitor.h | 4 + .../DebugInfo/CodeView/RandomAccessTypeVisitor.h | 103 + include/llvm/DebugInfo/CodeView/TypeDatabase.h | 23 +- .../llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h | 13 +- include/llvm/DebugInfo/CodeView/TypeDeserializer.h | 4 + include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h | 1 + include/llvm/DebugInfo/CodeView/TypeIndex.h | 62 + .../CodeView/TypeVisitorCallbackPipeline.h | 8 + .../llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h | 9 +- include/llvm/DebugInfo/DWARF/DWARFContext.h | 7 - include/llvm/DebugInfo/DWARF/DWARFDebugLine.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h | 7 +- include/llvm/DebugInfo/DWARF/DWARFRelocMap.h | 12 +- include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 6 +- include/llvm/DebugInfo/PDB/Native/RawTypes.h | 7 - include/llvm/DebugInfo/PDB/Native/TpiStream.h | 4 +- .../llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h | 2 +- .../ExecutionEngine/Orc/CompileOnDemandLayer.h | 13 + .../ExecutionEngine/Orc/OrcRemoteTargetClient.h | 29 +- .../ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h | 4 + include/llvm/ExecutionEngine/RTDyldMemoryManager.h | 16 +- include/llvm/ExecutionEngine/RuntimeDyld.h | 3 +- include/llvm/IR/Attributes.h | 41 + include/llvm/IR/CallingConv.h | 4 + include/llvm/IR/Constants.h | 10 +- include/llvm/IR/DebugInfoMetadata.h | 37 +- include/llvm/IR/DebugLoc.h | 16 + include/llvm/IR/DerivedTypes.h | 42 +- include/llvm/IR/DiagnosticInfo.h | 23 +- include/llvm/IR/Function.h | 38 +- include/llvm/IR/GetElementPtrTypeIterator.h | 8 +- include/llvm/IR/GlobalAlias.h | 8 +- include/llvm/IR/GlobalIFunc.h | 8 +- include/llvm/IR/GlobalObject.h | 4 +- include/llvm/IR/GlobalValue.h | 20 +- include/llvm/IR/GlobalVariable.h | 63 +- include/llvm/IR/IRBuilder.h | 39 + include/llvm/IR/InstrTypes.h | 18 +- include/llvm/IR/Instruction.h | 6 + include/llvm/IR/Instructions.h | 149 +- include/llvm/IR/Intrinsics.td | 44 + include/llvm/IR/LLVMContext.h | 12 +- include/llvm/IR/LegacyPassManager.h | 3 + include/llvm/IR/Module.h | 100 +- include/llvm/IR/ModuleSummaryIndex.h | 59 +- include/llvm/IR/PassManager.h | 122 +- include/llvm/IR/PassManagerInternal.h | 11 +- 
include/llvm/IR/PatternMatch.h | 39 +- include/llvm/IR/ProfileSummary.h | 20 +- include/llvm/IR/Statepoint.h | 16 +- include/llvm/IR/SymbolTableListTraits.h | 10 +- include/llvm/IR/TrackingMDRef.h | 16 +- include/llvm/IR/Type.h | 45 +- include/llvm/IR/TypeFinder.h | 4 +- include/llvm/IR/Use.h | 28 +- include/llvm/IR/UseListOrder.h | 2 +- include/llvm/IR/User.h | 19 +- include/llvm/IR/Value.h | 23 +- include/llvm/IR/ValueHandle.h | 48 +- include/llvm/IR/ValueMap.h | 42 +- include/llvm/IR/ValueSymbolTable.h | 6 +- include/llvm/IR/Verifier.h | 15 +- include/llvm/InitializePasses.h | 5 +- include/llvm/LibDriver/LibDriver.h | 24 - include/llvm/LinkAllPasses.h | 1 + include/llvm/Object/Wasm.h | 7 +- include/llvm/ObjectYAML/WasmYAML.h | 23 +- include/llvm/ProfileData/SampleProfWriter.h | 11 +- include/llvm/Support/BinaryStreamArray.h | 5 +- include/llvm/Support/Compiler.h | 8 +- include/llvm/Support/KnownBits.h | 60 + include/llvm/Support/Parallel.h | 249 + include/llvm/Support/Wasm.h | 23 +- include/llvm/Target/Target.td | 5 + include/llvm/Target/TargetInstrInfo.h | 13 +- include/llvm/Target/TargetLowering.h | 24 +- include/llvm/Target/TargetSchedule.td | 2 +- include/llvm/Target/TargetSelectionDAG.td | 2 +- include/llvm/ToolDrivers/llvm-lib/LibDriver.h | 24 + include/llvm/Transforms/Utils/Cloning.h | 7 +- include/llvm/Transforms/Utils/LoopUtils.h | 32 + include/llvm/Transforms/Vectorize/SLPVectorizer.h | 4 +- include/llvm/module.modulemap | 1 + lib/Analysis/BasicAliasAnalysis.cpp | 9 +- lib/Analysis/BranchProbabilityInfo.cpp | 11 +- lib/Analysis/CallGraph.cpp | 34 +- lib/Analysis/ConstantFolding.cpp | 80 +- lib/Analysis/DemandedBits.cpp | 10 +- lib/Analysis/InlineCost.cpp | 2 +- lib/Analysis/InstructionSimplify.cpp | 167 +- lib/Analysis/ModuleSummaryAnalysis.cpp | 3 +- lib/Analysis/OptimizationDiagnosticInfo.cpp | 2 +- lib/Analysis/ProfileSummaryInfo.cpp | 13 +- lib/Analysis/ScalarEvolution.cpp | 159 +- lib/Analysis/TargetLibraryInfo.cpp | 112 +- lib/Analysis/TargetTransformInfo.cpp | 13 + lib/Analysis/ValueTracking.cpp | 276 +- lib/Analysis/VectorUtils.cpp | 1 + lib/AsmParser/LLParser.cpp | 18 +- lib/Bitcode/Reader/BitcodeReader.cpp | 17 +- lib/Bitcode/Reader/MetadataLoader.cpp | 2 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 8 +- lib/Bitcode/Writer/ValueEnumerator.cpp | 7 +- lib/CMakeLists.txt | 2 +- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 14 +- lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 3 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 42 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 22 + lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 89 +- lib/CodeGen/AsmPrinter/DwarfDebug.h | 20 +- lib/CodeGen/AsmPrinter/DwarfFile.h | 4 + lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 6 +- lib/CodeGen/AsmPrinter/DwarfUnit.h | 5 +- lib/CodeGen/AsmPrinter/WinException.cpp | 12 +- lib/CodeGen/AtomicExpandPass.cpp | 31 +- lib/CodeGen/CMakeLists.txt | 3 + lib/CodeGen/CodeGen.cpp | 4 +- lib/CodeGen/CodeGenPrepare.cpp | 548 +- lib/CodeGen/ExpandPostRAPseudos.cpp | 5 +- lib/CodeGen/ExpandReductions.cpp | 167 + lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 10 +- lib/CodeGen/GlobalISel/RegBankSelect.cpp | 9 +- lib/CodeGen/GlobalISel/Utils.cpp | 8 + lib/CodeGen/IfConversion.cpp | 30 +- lib/CodeGen/LiveRangeShrink.cpp | 211 + lib/CodeGen/LiveVariables.cpp | 2 +- lib/CodeGen/MachineBlockPlacement.cpp | 30 +- lib/CodeGen/MachineVerifier.cpp | 4 +- lib/CodeGen/PHIElimination.cpp | 2 +- lib/CodeGen/RegisterCoalescer.cpp | 2 +- lib/CodeGen/RegisterScavenging.cpp | 7 +- lib/CodeGen/SafeStack.cpp | 172 +- 
lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 660 ++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 176 +- lib/CodeGen/SelectionDAG/FastISel.cpp | 20 +- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 33 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3 +- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 58 + lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 162 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 138 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 6 +- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 13 + lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 74 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 11 +- lib/CodeGen/ShrinkWrap.cpp | 12 +- lib/CodeGen/SjLjEHPrepare.cpp | 4 +- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 6 +- lib/CodeGen/TargetPassConfig.cpp | 11 + lib/CodeGen/TwoAddressInstructionPass.cpp | 7 +- lib/CodeGen/UnreachableBlockElim.cpp | 7 +- lib/DebugInfo/CodeView/CMakeLists.txt | 2 +- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 41 +- .../CodeView/ModuleDebugUnknownFragment.cpp | 10 - lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp | 91 + lib/DebugInfo/CodeView/TypeDatabase.cpp | 73 +- lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp | 65 +- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 9 +- lib/DebugInfo/DWARF/DWARFContext.cpp | 25 +- lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 5 +- lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 4 +- lib/DebugInfo/DWARF/DWARFDie.cpp | 12 +- lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 6 +- lib/DebugInfo/DWARF/DWARFUnit.cpp | 16 +- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 8 +- lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 2 +- lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 5 +- .../RuntimeDyld/RTDyldMemoryManager.cpp | 12 + lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 33 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 2 - lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 2 +- .../RuntimeDyld/Targets/RuntimeDyldCOFFI386.h | 1 - .../RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h | 1 - .../RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h | 3 - lib/Fuzzer/FuzzerDriver.cpp | 3 +- lib/Fuzzer/FuzzerFlags.def | 8 +- lib/Fuzzer/FuzzerInternal.h | 1 + lib/Fuzzer/FuzzerLoop.cpp | 20 + lib/Fuzzer/FuzzerMutate.cpp | 5 +- lib/Fuzzer/afl/afl_driver.cpp | 57 +- lib/Fuzzer/test/AFLDriverTest.cpp | 8 +- lib/Fuzzer/test/CMakeLists.txt | 1 + lib/Fuzzer/test/OverwriteInputTest.cpp | 13 + lib/Fuzzer/test/afl-driver.test | 26 + lib/Fuzzer/test/overwrite-input.test | 2 + lib/IR/AsmWriter.cpp | 7 + lib/IR/AttributeImpl.h | 18 +- lib/IR/Attributes.cpp | 72 +- lib/IR/ConstantFold.cpp | 10 +- lib/IR/ConstantRange.cpp | 37 +- lib/IR/Constants.cpp | 15 +- lib/IR/ConstantsContext.h | 49 +- lib/IR/DebugInfoMetadata.cpp | 18 + lib/IR/DebugLoc.cpp | 114 + lib/IR/DiagnosticInfo.cpp | 25 +- lib/IR/Function.cpp | 79 +- lib/IR/Globals.cpp | 43 +- lib/IR/IRBuilder.cpp | 88 + lib/IR/Instruction.cpp | 24 + lib/IR/Instructions.cpp | 119 +- lib/IR/LegacyPassManager.cpp | 13 + lib/IR/Module.cpp | 35 +- lib/IR/Type.cpp | 71 +- lib/IR/Verifier.cpp | 13 +- lib/LLVMBuild.txt | 2 +- lib/LTO/LTO.cpp | 4 +- lib/LTO/LTOCodeGenerator.cpp | 18 +- lib/LTO/ThinLTOCodeGenerator.cpp | 3 +- lib/LibDriver/CMakeLists.txt | 8 - lib/LibDriver/LLVMBuild.txt | 22 - lib/LibDriver/LibDriver.cpp | 171 - lib/LibDriver/Options.td | 25 - lib/Linker/IRMover.cpp | 18 +- lib/MC/MCObjectStreamer.cpp | 5 + lib/MC/MCParser/AsmParser.cpp | 21 + lib/Object/COFFObjectFile.cpp | 4 +- 
lib/Object/WasmObjectFile.cpp | 41 +- lib/ObjectYAML/WasmYAML.cpp | 8 +- lib/ProfileData/SampleProfWriter.cpp | 42 +- lib/Support/APInt.cpp | 312 +- lib/Support/CMakeLists.txt | 1 + lib/Support/Parallel.cpp | 138 + lib/Support/Unix/Path.inc | 30 +- lib/Support/Unix/Process.inc | 4 +- lib/Target/AArch64/AArch64.td | 1 + lib/Target/AArch64/AArch64CallLowering.cpp | 2 +- lib/Target/AArch64/AArch64FastISel.cpp | 2 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 6 +- lib/Target/AArch64/AArch64InstrInfo.cpp | 30 +- lib/Target/AArch64/AArch64InstrInfo.td | 8 +- lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 27 +- lib/Target/AArch64/AArch64SchedFalkorDetails.td | 78 +- lib/Target/AArch64/AArch64SchedFalkorWriteRes.td | 37 +- lib/Target/AArch64/AArch64Subtarget.cpp | 8 + lib/Target/AArch64/AArch64Subtarget.h | 10 + lib/Target/AArch64/AArch64TargetObjectFile.cpp | 8 + lib/Target/AArch64/AArch64TargetObjectFile.h | 3 + lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 106 +- lib/Target/AArch64/AArch64TargetTransformInfo.h | 11 + lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 19 +- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 10 +- lib/Target/AMDGPU/AMDGPU.h | 4 + lib/Target/AMDGPU/AMDGPU.td | 21 +- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 8 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 + lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 3 +- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1 + lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2881 +++++++++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 7 +- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 3 + lib/Target/AMDGPU/AMDGPUSubtarget.h | 15 + lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 18 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 34 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 + lib/Target/AMDGPU/CMakeLists.txt | 1 + lib/Target/AMDGPU/FLATInstructions.td | 34 +- lib/Target/AMDGPU/GCNRegPressure.cpp | 153 +- lib/Target/AMDGPU/GCNRegPressure.h | 61 +- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 282 +- lib/Target/AMDGPU/GCNSchedStrategy.h | 24 +- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 2 +- lib/Target/AMDGPU/SIISelLowering.cpp | 22 + lib/Target/AMDGPU/SIISelLowering.h | 1 + lib/Target/AMDGPU/SIInstrInfo.cpp | 310 +- lib/Target/AMDGPU/SIInstrInfo.h | 33 +- lib/Target/AMDGPU/SIInstructions.td | 7 + lib/Target/AMDGPU/VOP2Instructions.td | 51 +- lib/Target/AMDGPU/VOP3Instructions.td | 3 - lib/Target/ARM/ARMBaseInstrInfo.h | 18 +- lib/Target/ARM/ARMCallLowering.cpp | 2 +- lib/Target/ARM/ARMFastISel.cpp | 2 +- lib/Target/ARM/ARMISelLowering.cpp | 27 +- lib/Target/ARM/ARMISelLowering.h | 8 +- lib/Target/ARM/ARMInstrInfo.td | 7 +- lib/Target/ARM/ARMInstrThumb.td | 4 +- lib/Target/ARM/ARMInstructionSelector.cpp | 39 +- lib/Target/ARM/ARMLegalizerInfo.cpp | 8 +- lib/Target/ARM/ARMOptimizeBarriersPass.cpp | 4 +- lib/Target/ARM/ARMRegisterBankInfo.cpp | 1 + lib/Target/ARM/ARMTargetMachine.cpp | 2 + lib/Target/AVR/AVRFrameLowering.cpp | 2 +- lib/Target/AVR/AVRISelLowering.cpp | 10 +- lib/Target/AVR/AVRInstrInfo.td | 6 +- lib/Target/AVR/AVRRegisterInfo.cpp | 1 - lib/Target/BPF/BPFISelLowering.cpp | 3 +- lib/Target/BPF/BPFInstrInfo.td | 9 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 3 +- lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 2 +- lib/Target/Hexagon/HexagonPatterns.td | 7 +- lib/Target/Hexagon/HexagonPseudo.td | 2 +- lib/Target/Lanai/LanaiISelLowering.cpp | 31 +- lib/Target/Lanai/LanaiISelLowering.h | 5 + lib/Target/Lanai/LanaiInstrInfo.td | 12 +- 
lib/Target/MSP430/MSP430FrameLowering.cpp | 7 +- lib/Target/MSP430/MSP430ISelLowering.cpp | 271 +- lib/Target/MSP430/MSP430InstrInfo.h | 6 + lib/Target/MSP430/MSP430InstrInfo.td | 9 +- lib/Target/MSP430/MSP430RegisterInfo.cpp | 4 +- lib/Target/Mips/MipsFastISel.cpp | 2 +- lib/Target/Mips/MipsISelLowering.cpp | 2 +- lib/Target/Mips/MipsInstrInfo.td | 6 +- lib/Target/Mips/MipsOptimizePICCall.cpp | 2 +- lib/Target/NVPTX/NVPTXISelLowering.cpp | 18 +- lib/Target/NVPTX/NVPTXInstrInfo.td | 9 +- .../PowerPC/Disassembler/PPCDisassembler.cpp | 17 + lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 3 +- lib/Target/PowerPC/PPCFastISel.cpp | 3 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 255 + lib/Target/PowerPC/PPCISelLowering.cpp | 98 +- lib/Target/PowerPC/PPCISelLowering.h | 29 +- lib/Target/PowerPC/PPCInstr64Bit.td | 28 +- lib/Target/PowerPC/PPCInstrAltivec.td | 40 +- lib/Target/PowerPC/PPCInstrInfo.td | 13 +- lib/Target/PowerPC/PPCInstrVSX.td | 2 +- lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 3 +- lib/Target/Sparc/SparcISelLowering.cpp | 31 +- lib/Target/Sparc/SparcInstrInfo.td | 9 +- lib/Target/Sparc/SparcRegisterInfo.td | 6 +- lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 4 + .../SystemZ/Disassembler/SystemZDisassembler.cpp | 19 + .../SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 14 + lib/Target/SystemZ/README.txt | 2 +- lib/Target/SystemZ/SystemZFeatures.td | 14 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 13 +- lib/Target/SystemZ/SystemZISelLowering.h | 2 + lib/Target/SystemZ/SystemZInstrFP.td | 13 + lib/Target/SystemZ/SystemZInstrFormats.td | 301 +- lib/Target/SystemZ/SystemZInstrInfo.td | 201 +- lib/Target/SystemZ/SystemZOperands.td | 2 + lib/Target/SystemZ/SystemZOperators.td | 3 +- lib/Target/SystemZ/SystemZSchedule.td | 4 + lib/Target/SystemZ/SystemZScheduleZ13.td | 84 +- lib/Target/SystemZ/SystemZScheduleZ196.td | 92 +- lib/Target/SystemZ/SystemZScheduleZEC12.td | 92 +- lib/Target/SystemZ/SystemZSubtarget.cpp | 7 +- lib/Target/SystemZ/SystemZSubtarget.h | 10 + lib/Target/WebAssembly/WebAssemblyInstrCall.td | 4 +- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 3 +- lib/Target/X86/X86.td | 3 + lib/Target/X86/X86FastISel.cpp | 48 +- lib/Target/X86/X86FixupLEAs.cpp | 269 +- lib/Target/X86/X86ISelDAGToDAG.cpp | 41 +- lib/Target/X86/X86ISelLowering.cpp | 214 +- lib/Target/X86/X86InstrCompiler.td | 14 +- lib/Target/X86/X86InstrInfo.cpp | 52 +- lib/Target/X86/X86InstrInfo.h | 11 +- lib/Target/X86/X86InstrInfo.td | 35 +- lib/Target/X86/X86InstrSSE.td | 18 +- lib/Target/X86/X86InstructionSelector.cpp | 214 +- lib/Target/X86/X86IntrinsicsInfo.h | 2 +- lib/Target/X86/X86LegalizerInfo.cpp | 16 +- lib/Target/X86/X86RegisterInfo.cpp | 14 +- lib/Target/X86/X86Subtarget.h | 6 + lib/Target/X86/X86TargetMachine.cpp | 4 +- lib/Target/X86/X86TargetTransformInfo.cpp | 194 +- lib/Target/X86/X86WinEHState.cpp | 2 +- lib/Target/XCore/XCoreISelLowering.cpp | 5 +- lib/Target/XCore/XCoreInstrInfo.td | 11 +- lib/ToolDrivers/CMakeLists.txt | 1 + lib/ToolDrivers/LLVMBuild.txt | 24 + lib/ToolDrivers/llvm-lib/CMakeLists.txt | 8 + lib/ToolDrivers/llvm-lib/LLVMBuild.txt | 22 + lib/ToolDrivers/llvm-lib/LibDriver.cpp | 171 + lib/ToolDrivers/llvm-lib/Options.td | 25 + lib/Transforms/Coroutines/CoroFrame.cpp | 100 +- lib/Transforms/IPO/FunctionImport.cpp | 15 +- lib/Transforms/IPO/Inliner.cpp | 4 +- lib/Transforms/IPO/PartialInlining.cpp | 426 +- lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 6 +- lib/Transforms/InstCombine/InstCombineAddSub.cpp | 199 +- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 109 +- 
lib/Transforms/InstCombine/InstCombineCalls.cpp | 8 +- lib/Transforms/InstCombine/InstCombineCasts.cpp | 25 +- lib/Transforms/InstCombine/InstCombineCompares.cpp | 39 +- lib/Transforms/InstCombine/InstCombineInternal.h | 30 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 6 +- .../InstCombine/InstCombineMulDivRem.cpp | 20 +- .../InstCombine/InstCombineSimplifyDemanded.cpp | 2 +- .../InstCombine/InstructionCombining.cpp | 26 +- .../Instrumentation/AddressSanitizer.cpp | 34 +- .../Instrumentation/DataFlowSanitizer.cpp | 8 +- .../Instrumentation/EfficiencySanitizer.cpp | 49 +- lib/Transforms/Instrumentation/MemorySanitizer.cpp | 7 +- .../Scalar/CorrelatedValuePropagation.cpp | 10 +- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 295 +- lib/Transforms/Scalar/NewGVN.cpp | 210 +- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 161 +- lib/Transforms/Scalar/SpeculativeExecution.cpp | 43 +- lib/Transforms/Utils/BypassSlowDivision.cpp | 4 +- lib/Transforms/Utils/CloneFunction.cpp | 32 +- lib/Transforms/Utils/CloneModule.cpp | 2 +- lib/Transforms/Utils/EscapeEnumerator.cpp | 3 +- lib/Transforms/Utils/InlineFunction.cpp | 61 +- lib/Transforms/Utils/InstructionNamer.cpp | 13 +- lib/Transforms/Utils/Local.cpp | 106 +- lib/Transforms/Utils/LoopUtils.cpp | 201 + lib/Transforms/Utils/ModuleUtils.cpp | 12 +- lib/Transforms/Utils/SimplifyLibCalls.cpp | 6 +- lib/Transforms/Utils/VNCoercion.cpp | 9 + lib/Transforms/Utils/ValueMapper.cpp | 9 +- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 241 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 112 +- lib/XRay/Trace.cpp | 34 +- projects/CMakeLists.txt | 4 +- test/Analysis/BasicAA/cs-cs-arm.ll | 34 + test/Analysis/BasicAA/cs-cs.ll | 37 +- test/Analysis/BasicAA/intrinsics-arm.ll | 31 + test/Analysis/BasicAA/intrinsics.ll | 34 +- test/Analysis/BranchProbabilityInfo/basic.ll | 6 +- .../CostModel/AArch64/free-widening-casts.ll | 622 ++ test/Analysis/CostModel/AMDGPU/extractelement.ll | 74 +- test/Analysis/CostModel/AMDGPU/insertelement.ll | 43 +- test/Analysis/CostModel/AMDGPU/shufflevector.ll | 43 + test/Analysis/CostModel/X86/div.ll | 32 +- test/Analysis/CostModel/X86/vshift-ashr-cost.ll | 138 +- test/Analysis/CostModel/X86/vshift-lshr-cost.ll | 128 +- test/Analysis/CostModel/X86/vshift-shl-cost.ll | 134 +- .../ScalarEvolution/different-loops-recs.ll | 454 ++ test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll | 18 +- test/Assembler/globalvariable-attributes.ll | 19 + test/Bitcode/globalvariable-attributes.ll | 19 + test/Bitcode/ptest-old.ll | 1 + ...o-function-summary-callgraph-profile-summary.ll | 2 +- ...ion-summary-callgraph-sample-profile-summary.ll | 121 + .../AArch64/GlobalISel/arm64-regbankselect.mir | 96 + test/CodeGen/AArch64/GlobalISel/call-translator.ll | 4 +- test/CodeGen/AArch64/arm64-ccmp.ll | 2 +- test/CodeGen/AArch64/arm64-fml-combines.ll | 24 +- test/CodeGen/AArch64/arm64-hello.ll | 4 +- test/CodeGen/AArch64/arm64-misched-multimmo.ll | 2 +- test/CodeGen/AArch64/macho-global-symbols.ll | 17 + test/CodeGen/AArch64/misched-fusion-aes.ll | 33 + test/CodeGen/AArch64/stackmap-frame-setup.ll | 4 +- .../AMDGPU/GlobalISel/inst-select-load-flat.mir | 2 +- .../AMDGPU/GlobalISel/inst-select-store-flat.mir | 2 +- .../AMDGPU/GlobalISel/legalize-constant.mir | 20 + test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg | 2 + test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir | 70 +- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll | 12 +- test/CodeGen/AMDGPU/ctpop.ll | 80 +- test/CodeGen/AMDGPU/ctpop64.ll | 16 +- 
test/CodeGen/AMDGPU/fneg-combines.ll | 9 +- test/CodeGen/AMDGPU/fneg.f16.ll | 39 +- test/CodeGen/AMDGPU/inserted-wait-states.mir | 10 +- test/CodeGen/AMDGPU/limit-coalesce.mir | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 18 +- test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll | 4 +- test/CodeGen/AMDGPU/madak.ll | 6 +- test/CodeGen/AMDGPU/promote-alloca-volatile.ll | 12 +- test/CodeGen/AMDGPU/v_madak_f16.ll | 2 +- test/CodeGen/AMDGPU/waitcnt.mir | 22 +- .../ARM/GlobalISel/arm-instruction-select.mir | 200 +- test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll | 16 +- test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 30 +- test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir | 136 +- test/CodeGen/ARM/divmod-eabi.ll | 73 +- test/CodeGen/ARM/divmod.ll | 1 + test/CodeGen/AVR/select-mbb-placement-bug.ll | 35 + .../Generic/expand-experimental-reductions.ll | 210 + test/CodeGen/Hexagon/regalloc-bad-undef.mir | 8 +- test/CodeGen/Lanai/masking_setccs.ll | 48 + test/CodeGen/Lanai/peephole-compare.mir | 4 +- .../ARM/PR32721_ifcvt_triangle_unanalyzable.mir | 24 + test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir | 64 + .../MIR/X86/frame-info-save-restore-points.mir | 2 +- test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll | 22 - test/CodeGen/MSP430/hwmult16.ll | 43 + test/CodeGen/MSP430/hwmult32.ll | 43 + test/CodeGen/MSP430/hwmultf5.ll | 43 + test/CodeGen/MSP430/jumptable.ll | 2 +- test/CodeGen/MSP430/libcalls.ll | 595 ++ test/CodeGen/MSP430/promote-i8-mul.ll | 22 + test/CodeGen/NVPTX/bug17709.ll | 52 +- test/CodeGen/NVPTX/ctlz.ll | 2 +- test/CodeGen/NVPTX/ctpop.ll | 2 +- test/CodeGen/NVPTX/cttz.ll | 3 +- test/CodeGen/NVPTX/f16-instructions.ll | 2157 +++---- test/CodeGen/NVPTX/f16x2-instructions.ll | 2853 ++++----- test/CodeGen/NVPTX/fma.ll | 84 +- test/CodeGen/NVPTX/i8-param.ll | 46 +- test/CodeGen/NVPTX/param-load-store.ll | 1878 +++--- test/CodeGen/NVPTX/sched1.ll | 4 +- test/CodeGen/NVPTX/sched2.ll | 4 +- test/CodeGen/NVPTX/simple-call.ll | 52 +- test/CodeGen/NVPTX/vec8.ll | 2 +- test/CodeGen/NVPTX/vector-call.ll | 60 +- test/CodeGen/NVPTX/zeroext-32bit.ll | 52 +- test/CodeGen/PowerPC/mtvsrdd.ll | 22 + test/CodeGen/PowerPC/setcc-logic.ll | 12 +- test/CodeGen/PowerPC/stackmap-frame-setup.ll | 4 +- test/CodeGen/PowerPC/tail-dup-layout.ll | 97 +- test/CodeGen/PowerPC/testComparesieqsc.ll | 138 + test/CodeGen/PowerPC/testComparesieqsi.ll | 138 + test/CodeGen/PowerPC/testComparesieqss.ll | 138 + test/CodeGen/PowerPC/testComparesiequc.ll | 138 + test/CodeGen/PowerPC/testComparesiequi.ll | 138 + test/CodeGen/PowerPC/testComparesiequs.ll | 138 + test/CodeGen/PowerPC/testCompareslleqsc.ll | 138 + test/CodeGen/PowerPC/testCompareslleqsi.ll | 138 + test/CodeGen/PowerPC/testCompareslleqss.ll | 137 + test/CodeGen/PowerPC/testComparesllequc.ll | 137 + test/CodeGen/PowerPC/testComparesllequi.ll | 137 + test/CodeGen/PowerPC/testComparesllequs.ll | 137 + test/CodeGen/SPARC/LeonItinerariesUT.ll | 4 +- test/CodeGen/SPARC/inlineasm-v9.ll | 30 + test/CodeGen/SPARC/inlineasm.ll | 18 + test/CodeGen/SystemZ/list-ilp-crash.ll | 23 + test/CodeGen/SystemZ/lower-copy-undef-src.mir | 14 + test/CodeGen/Thumb2/v8_IT_5.ll | 2 +- test/CodeGen/X86/2007-01-08-InstrSched.ll | 4 +- test/CodeGen/X86/2010-01-18-DbgValue.ll | 13 +- test/CodeGen/X86/2012-11-30-handlemove-dbg.ll | 51 - test/CodeGen/X86/2012-11-30-misched-dbg.ll | 142 - test/CodeGen/X86/2012-11-30-regpres-dbg.ll | 47 - test/CodeGen/X86/GlobalISel/add-scalar.ll | 44 + test/CodeGen/X86/GlobalISel/binop.ll | 42 - test/CodeGen/X86/GlobalISel/br.ll | 19 + test/CodeGen/X86/GlobalISel/cmp.ll | 
159 + test/CodeGen/X86/GlobalISel/ext-x86-64.ll | 14 +- test/CodeGen/X86/GlobalISel/ext.ll | 18 + test/CodeGen/X86/GlobalISel/legalize-cmp.mir | 179 + .../CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir | 64 + test/CodeGen/X86/GlobalISel/legalize-ext.mir | 64 + test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll | 101 + test/CodeGen/X86/GlobalISel/memop-scalar.ll | 146 + test/CodeGen/X86/GlobalISel/memop-vec.ll | 39 + test/CodeGen/X86/GlobalISel/memop-x32.ll | 101 - test/CodeGen/X86/GlobalISel/memop.ll | 206 - .../X86/GlobalISel/regbankselect-X86_64.mir | 125 +- test/CodeGen/X86/GlobalISel/select-br.mir | 39 + test/CodeGen/X86/GlobalISel/select-cmp.mir | 563 ++ test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir | 38 + test/CodeGen/X86/GlobalISel/select-ext.mir | 33 + .../X86/GlobalISel/select-memop-scalar-x32.mir | 310 + .../CodeGen/X86/GlobalISel/select-memop-scalar.mir | 500 ++ test/CodeGen/X86/GlobalISel/select-memop-v128.mir | 143 + test/CodeGen/X86/GlobalISel/select-memop-x32.mir | 310 - test/CodeGen/X86/GlobalISel/select-memop.mir | 637 -- test/CodeGen/X86/O0-pipeline.ll | 67 + test/CodeGen/X86/all-ones-vector.ll | 112 +- test/CodeGen/X86/avg.ll | 833 ++- test/CodeGen/X86/avx-basic.ll | 8 +- test/CodeGen/X86/avx-cvt-3.ll | 22 +- test/CodeGen/X86/avx-intrinsics-fast-isel.ll | 60 +- test/CodeGen/X86/avx-schedule.ll | 50 + test/CodeGen/X86/avx.ll | 2 +- test/CodeGen/X86/avx512-cmp-kor-sequence.ll | 6 +- test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 10 +- test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 44 +- test/CodeGen/X86/avx512-intrinsics.ll | 215 +- test/CodeGen/X86/avx512-mask-spills.ll | 40 +- test/CodeGen/X86/avx512-scalar_mask.ll | 107 + test/CodeGen/X86/avx512-vselect.ll | 61 + test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 12 +- test/CodeGen/X86/avx512bw-intrinsics.ll | 16 +- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll | 24 +- test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll | 2 +- test/CodeGen/X86/avx512cdvl-intrinsics.ll | 2 +- test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 2 +- test/CodeGen/X86/avx512dq-intrinsics.ll | 4 +- test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll | 10 +- test/CodeGen/X86/avx512dqvl-intrinsics.ll | 4 +- test/CodeGen/X86/avx512er-intrinsics.ll | 48 +- test/CodeGen/X86/avx512ifma-intrinsics.ll | 8 +- test/CodeGen/X86/avx512ifmavl-intrinsics.ll | 16 +- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 64 +- test/CodeGen/X86/avx512vl-intrinsics.ll | 28 +- test/CodeGen/X86/bmi.ll | 76 + test/CodeGen/X86/bswap_tree2.ll | 35 +- test/CodeGen/X86/cast-vsel.ll | 37 +- test/CodeGen/X86/combine-abs.ll | 11 +- test/CodeGen/X86/combine-shl.ll | 3 +- test/CodeGen/X86/combine-srl.ll | 22 +- test/CodeGen/X86/constructor.ll | 5 + test/CodeGen/X86/dbg-baseptr.ll | 62 +- test/CodeGen/X86/elf-associated.ll | 5 + test/CodeGen/X86/fold-tied-op.ll | 7 +- test/CodeGen/X86/fp128-i128.ll | 2 +- test/CodeGen/X86/haddsub-2.ll | 12 +- test/CodeGen/X86/known-signbits-vector.ll | 61 + test/CodeGen/X86/leaFixup32.mir | 508 ++ test/CodeGen/X86/leaFixup64.mir | 1041 +++ test/CodeGen/X86/lrshrink.ll | 57 + test/CodeGen/X86/madd.ll | 34 +- test/CodeGen/X86/masked_gather_scatter.ll | 2 +- test/CodeGen/X86/merge-consecutive-loads-128.ll | 16 +- test/CodeGen/X86/misched-matrix.ll | 4 +- test/CodeGen/X86/not-and-simplify.ll | 28 +- test/CodeGen/X86/oddshuffles.ll | 34 +- test/CodeGen/X86/packss.ll | 11 +- test/CodeGen/X86/pmul.ll | 55 +- test/CodeGen/X86/pr28129.ll | 32 +- test/CodeGen/X86/pr29112.ll | 8 +- test/CodeGen/X86/pr30562.ll | 1 + test/CodeGen/X86/pr31088.ll | 2 +- 
test/CodeGen/X86/pr32284.ll | 71 +- test/CodeGen/X86/pr32907.ll | 53 +- .../X86/replace_unsupported_masked_mem_intrin.ll | 37 + test/CodeGen/X86/rotate.ll | 16 +- test/CodeGen/X86/sad.ll | 929 ++- test/CodeGen/X86/select.ll | 28 +- test/CodeGen/X86/setcc-wide-types.ll | 56 +- test/CodeGen/X86/shrink_vmul_sse.ll | 2 +- test/CodeGen/X86/shuffle-of-splat-multiuses.ll | 34 +- test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 10 +- test/CodeGen/X86/sse1.ll | 20 +- test/CodeGen/X86/sse3-avx-addsub-2.ll | 14 +- test/CodeGen/X86/sse41.ll | 8 +- test/CodeGen/X86/stackmap-frame-setup.ll | 4 +- test/CodeGen/X86/vec_int_to_fp.ll | 84 +- test/CodeGen/X86/vec_set-2.ll | 31 +- test/CodeGen/X86/vec_set-3.ll | 45 +- test/CodeGen/X86/vec_set-4.ll | 38 +- test/CodeGen/X86/vec_set-6.ll | 23 +- test/CodeGen/X86/vec_set-7.ll | 18 +- test/CodeGen/X86/vec_set-8.ll | 16 +- test/CodeGen/X86/vec_set-A.ll | 19 +- test/CodeGen/X86/vec_set-B.ll | 40 +- test/CodeGen/X86/vec_set-C.ll | 10 +- test/CodeGen/X86/vec_set.ll | 63 +- test/CodeGen/X86/vector-bitreverse.ll | 6 +- test/CodeGen/X86/vector-blend.ll | 4 +- test/CodeGen/X86/vector-lzcnt-128.ll | 380 +- test/CodeGen/X86/vector-lzcnt-256.ll | 536 +- test/CodeGen/X86/vector-narrow-binop.ll | 111 + test/CodeGen/X86/vector-pcmp.ll | 27 +- test/CodeGen/X86/vector-shift-ashr-256.ll | 580 ++ test/CodeGen/X86/vector-shift-lshr-256.ll | 434 ++ test/CodeGen/X86/vector-shift-shl-256.ll | 377 ++ test/CodeGen/X86/vector-shuffle-512-v32.ll | 356 +- test/CodeGen/X86/vector-sqrt.ll | 8 +- test/CodeGen/X86/viabs.ll | 107 +- test/CodeGen/X86/vselect-pcmp.ll | 12 +- test/CodeGen/X86/x86-interleaved-access.ll | 14 +- .../X86/x86-no_caller_saved_registers-preserve.ll | 26 +- test/CodeGen/X86/x86-no_caller_saved_registers.ll | 62 +- test/CodeGen/X86/x86-shrink-wrapping.ll | 53 +- test/CodeGen/X86/xop-intrinsics-fast-isel.ll | 8 +- test/DebugInfo/COFF/local-variables.ll | 5 - test/DebugInfo/COFF/no-cus.ll | 25 + test/DebugInfo/Inputs/typeunit-header.elf-x86-64 | Bin 0 -> 840 bytes test/DebugInfo/Inputs/typeunit-header.s | 49 + test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test | 4 +- test/DebugInfo/X86/dbg-declare-inalloca.ll | 199 + .../X86/split-dwarf-cross-unit-reference.ll | 198 +- test/DebugInfo/typeunit-header.test | 15 + test/Feature/intrinsic-noduplicate.ll | 1 + test/Instrumentation/MemorySanitizer/msan_basic.ll | 64 - .../MemorySanitizer/msan_x86intrinsics.ll | 68 + test/Instrumentation/MemorySanitizer/pr32842.ll | 20 + .../MemorySanitizer/vector_arith.ll | 1 + test/Instrumentation/MemorySanitizer/vector_cmp.ll | 1 + test/Instrumentation/MemorySanitizer/vector_cvt.ll | 1 + .../Instrumentation/MemorySanitizer/vector_pack.ll | 1 + .../MemorySanitizer/vector_shift.ll | 1 + test/LTO/Resolution/X86/ifunc.ll | 15 + test/MC/AArch64/directive-cpu-err.s | 9 + test/MC/AArch64/label-arithmetic-diags-elf.s | 9 + test/MC/AMDGPU/flat.s | 66 - test/MC/AMDGPU/literal16.s | 8 +- test/MC/AMDGPU/vop2.s | 38 +- test/MC/AMDGPU/vop3-convert.s | 14 +- test/MC/AsmParser/altmacro_string_escape.s | 29 + test/MC/Disassembler/AMDGPU/flat_vi.txt | 24 - test/MC/Disassembler/AMDGPU/literal16_vi.txt | 6 +- test/MC/Disassembler/AMDGPU/vop2_vi.txt | 30 +- test/MC/Disassembler/AMDGPU/vop3_vi.txt | 18 + .../PowerPC/ppc64-encoding-p9vector.txt | 4 + test/MC/Disassembler/SystemZ/insns-z13.txt | 4068 ++++++------ test/MC/Disassembler/SystemZ/insns.txt | 6717 +++++++++++++------- test/MC/SystemZ/insn-bad-z13.s | 792 ++- test/MC/SystemZ/insn-bad-z196.s | 53 +- test/MC/SystemZ/insn-bad-zEC12.s | 511 +- 
test/MC/SystemZ/insn-bad.s | 2284 ++++++- test/MC/SystemZ/insn-good-z13.s | 1736 ++--- test/MC/SystemZ/insn-good-z196.s | 158 +- test/MC/SystemZ/insn-good-zEC12.s | 16 +- test/MC/SystemZ/insn-good.s | 2131 ++++++- test/Object/Inputs/COFF/empty-drectve.yaml | 14 + test/Object/X86/archive-symbol-table.s | 19 + test/Object/X86/nm-ir.ll | 2 +- test/Object/coff-empty-drectve.test | 3 + test/Object/invalid.test | 4 +- test/Object/wasm-invalid-start.test | 10 + test/ObjectYAML/wasm/export_section.yaml | 28 +- test/ObjectYAML/wasm/function_section.yaml | 4 +- test/ObjectYAML/wasm/import_section.yaml | 45 +- test/ObjectYAML/wasm/start_section.yaml | 9 + test/TableGen/AsmVariant.td | 2 +- test/TableGen/RegisterEncoder.td | 35 + .../CodeExtractor/ExtractedFnEntryCount.ll | 2 +- .../CodeExtractor/MultipleExitBranchProb.ll | 2 +- test/Transforms/CodeExtractor/PartialInlineAnd.ll | 4 +- .../CodeExtractor/PartialInlineEntryUpdate.ll | 41 + .../CodeExtractor/PartialInlineHighCost.ll | 107 + test/Transforms/CodeExtractor/PartialInlineOr.ll | 4 +- .../Transforms/CodeExtractor/PartialInlineOrAnd.ll | 4 +- test/Transforms/CodeExtractor/SingleCondition.ll | 4 +- .../CodeExtractor/X86/InheritTargetAttributes.ll | 4 +- .../Transforms/CodeGenPrepare/section-samplepgo.ll | 57 + test/Transforms/CodeGenPrepare/section.ll | 20 +- test/Transforms/ConstProp/calls-math-finite.ll | 83 + test/Transforms/ConstProp/calls.ll | 206 - test/Transforms/ConstProp/sse.ll | 208 + .../Coroutines/coro-eh-aware-edge-split.ll | 218 + .../GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll | 7 +- test/Transforms/GVN/PRE/nonintegral.ll | 39 + .../IndVarSimplify/2011-10-27-lftrnull.ll | 2 +- test/Transforms/InferFunctionAttrs/annotate.ll | 126 + test/Transforms/InferFunctionAttrs/no-proto.ll | 126 + test/Transforms/Inline/inline-cold.ll | 20 +- .../inline-constexpr-addrspacecast-argument.ll | 2 +- test/Transforms/Inline/partial-inline-act.ll | 2 +- test/Transforms/Inline/prof-update.ll | 35 +- .../InstCombine/2012-04-23-Neon-Intrinsics.ll | 135 - .../AArch64/2012-04-23-Neon-Intrinsics.ll | 71 + test/Transforms/InstCombine/AArch64/lit.local.cfg | 2 + .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 1540 +++++ test/Transforms/InstCombine/AMDGPU/lit.local.cfg | 2 + .../InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll | 65 + .../InstCombine/ARM/constant-fold-hang.ll | 14 + test/Transforms/InstCombine/ARM/lit.local.cfg | 2 + test/Transforms/InstCombine/ARM/neon-intrinsics.ll | 25 + .../InstCombine/PowerPC/aligned-altivec.ll | 131 + test/Transforms/InstCombine/PowerPC/aligned-qpx.ll | 165 + test/Transforms/InstCombine/PowerPC/lit.local.cfg | 3 + .../InstCombine/PowerPC/vsx-unaligned.ll | 44 + .../InstCombine/X86/X86FsubCmpCombine.ll | 181 + test/Transforms/InstCombine/X86/blend_x86.ll | 151 + test/Transforms/InstCombine/X86/lit.local.cfg | 2 + test/Transforms/InstCombine/X86/pr2645-1.ll | 39 + .../InstCombine/X86/shufflemask-undef.ll | 110 + test/Transforms/InstCombine/X86/x86-avx2.ll | 109 + test/Transforms/InstCombine/X86/x86-avx512.ll | 2793 ++++++++ .../InstCombine/X86/x86-crc32-demanded.ll | 17 + test/Transforms/InstCombine/X86/x86-f16c.ll | 68 + test/Transforms/InstCombine/X86/x86-fma.ll | 315 + test/Transforms/InstCombine/X86/x86-insertps.ll | 166 + .../InstCombine/X86/x86-masked-memops.ll | 328 + test/Transforms/InstCombine/X86/x86-movmsk.ll | 324 + test/Transforms/InstCombine/X86/x86-muldq.ll | 245 + test/Transforms/InstCombine/X86/x86-pack.ll | 366 ++ test/Transforms/InstCombine/X86/x86-pshufb.ll | 515 ++ 
test/Transforms/InstCombine/X86/x86-sse.ll | 613 ++ test/Transforms/InstCombine/X86/x86-sse2.ll | 460 ++ test/Transforms/InstCombine/X86/x86-sse41.ll | 98 + test/Transforms/InstCombine/X86/x86-sse4a.ll | 408 ++ .../InstCombine/X86/x86-vec_demanded_elts.ll | 110 + .../InstCombine/X86/x86-vector-shifts.ll | 3434 ++++++++++ test/Transforms/InstCombine/X86/x86-vperm2.ll | 313 + test/Transforms/InstCombine/X86/x86-vpermil.ll | 298 + test/Transforms/InstCombine/X86/x86-xop.ll | 305 + test/Transforms/InstCombine/X86FsubCmpCombine.ll | 181 - test/Transforms/InstCombine/add.ll | 26 + test/Transforms/InstCombine/aligned-altivec.ll | 131 - test/Transforms/InstCombine/aligned-qpx.ll | 165 - test/Transforms/InstCombine/amdgcn-intrinsics.ll | 1540 ----- test/Transforms/InstCombine/and.ll | 2 +- test/Transforms/InstCombine/bit-tracking.ll | 26 - test/Transforms/InstCombine/blend_x86.ll | 151 - test/Transforms/InstCombine/cast.ll | 38 + test/Transforms/InstCombine/constant-fold-hang.ll | 14 - .../InstCombine/constant-fold-iteration.ll | 10 + test/Transforms/InstCombine/demorgan.ll | 8 +- test/Transforms/InstCombine/icmp.ll | 15 + test/Transforms/InstCombine/intrinsics.ll | 29 +- test/Transforms/InstCombine/logical-select.ll | 75 + test/Transforms/InstCombine/neon-intrinsics.ll | 25 - test/Transforms/InstCombine/not.ll | 76 +- test/Transforms/InstCombine/or-xor.ll | 70 + test/Transforms/InstCombine/or.ll | 109 - test/Transforms/InstCombine/pr2645-1.ll | 39 - test/Transforms/InstCombine/sext.ll | 2 +- test/Transforms/InstCombine/shufflemask-undef.ll | 109 - test/Transforms/InstCombine/trunc.ll | 2 +- test/Transforms/InstCombine/vec_demanded_elts.ll | 108 - test/Transforms/InstCombine/vsx-unaligned.ll | 44 - test/Transforms/InstCombine/x86-avx2.ll | 109 - test/Transforms/InstCombine/x86-avx512.ll | 2793 -------- test/Transforms/InstCombine/x86-crc32-demanded.ll | 17 - test/Transforms/InstCombine/x86-f16c.ll | 68 - test/Transforms/InstCombine/x86-fma.ll | 315 - test/Transforms/InstCombine/x86-insertps.ll | 166 - test/Transforms/InstCombine/x86-masked-memops.ll | 328 - test/Transforms/InstCombine/x86-movmsk.ll | 324 - test/Transforms/InstCombine/x86-muldq.ll | 245 - test/Transforms/InstCombine/x86-pack.ll | 366 -- test/Transforms/InstCombine/x86-pshufb.ll | 515 -- test/Transforms/InstCombine/x86-sse.ll | 613 -- test/Transforms/InstCombine/x86-sse2.ll | 460 -- test/Transforms/InstCombine/x86-sse41.ll | 98 - test/Transforms/InstCombine/x86-sse4a.ll | 408 -- test/Transforms/InstCombine/x86-vector-shifts.ll | 3434 ---------- test/Transforms/InstCombine/x86-vperm2.ll | 313 - test/Transforms/InstCombine/x86-vpermil.ll | 298 - test/Transforms/InstCombine/x86-xop.ll | 305 - test/Transforms/InstCombine/xor2.ll | 11 - test/Transforms/InstNamer/basic.ll | 19 + test/Transforms/InstSimplify/AndOrXor.ll | 173 + test/Transforms/InstSimplify/apint-or.ll | 72 - test/Transforms/InstSimplify/compare.ll | 7 +- test/Transforms/InstSimplify/or.ll | 181 + test/Transforms/LoopIdiom/ARM/ctlz.ll | 185 + test/Transforms/LoopIdiom/X86/ctlz.ll | 185 + test/Transforms/LoopUnroll/not-rotated.ll | 2 +- .../LoopVectorize/X86/svml-calls-finite.ll | 187 + test/Transforms/LoopVectorize/induction.ll | 45 + test/Transforms/LoopVectorize/pr32859.ll | 30 + test/Transforms/NewGVN/pr32934.ll | 69 + test/Transforms/NewGVN/pr32952.ll | 42 + test/Transforms/NewGVN/verify-memoryphi.ll | 29 + .../SLPVectorizer/AArch64/64-bit-vector.ll | 22 + .../SLPVectorizer/AArch64/getelementptr.ll | 43 +- .../Transforms/SLPVectorizer/AArch64/horizontal.ll | 33 +- 
test/Transforms/SLPVectorizer/AArch64/remarks.ll | 32 + test/Transforms/SLPVectorizer/X86/arith-add.ll | 649 ++ test/Transforms/SLPVectorizer/X86/arith-mul.ll | 700 ++ test/Transforms/SLPVectorizer/X86/arith-sub.ll | 649 ++ test/Transforms/SLPVectorizer/X86/shift-ashr.ll | 913 +++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll | 862 +++ test/Transforms/SLPVectorizer/X86/shift-shl.ll | 814 +++ .../SimpleLoopUnswitch/trivial-unswitch.ll | 199 + test/Transforms/SpeculativeExecution/spec-other.ll | 32 - .../Transforms/SpeculativeExecution/spec-vector.ll | 73 - test/Transforms/Util/split-bit-piece.ll | 110 +- test/Verifier/metadata-function-dbg.ll | 16 +- test/tools/llvm-pdbdump/Inputs/FilterTest.cpp | 18 + test/tools/llvm-pdbdump/Inputs/FilterTest.pdb | Bin 44032 -> 44032 bytes test/tools/llvm-pdbdump/regex-filter.test | 8 +- test/tools/llvm-pdbdump/symbol-filters.test | 74 + test/tools/llvm-profdata/sample-profile-basic.test | 7 +- test/tools/llvm-readobj/wasm-invalid.test | 7 + tools/bugpoint/ExtractFunction.cpp | 3 +- tools/llc/llc.cpp | 2 + tools/lli/RemoteJITUtils.h | 5 +- tools/llvm-ar/llvm-ar.cpp | 2 +- tools/llvm-pdbdump/LLVMOutputStyle.cpp | 2 +- tools/llvm-pdbdump/PrettyCompilandDumper.cpp | 12 + tools/llvm-pdbdump/PrettyFunctionDumper.cpp | 10 +- tools/llvm-pdbdump/llvm-pdbdump.cpp | 102 +- tools/llvm-pdbdump/llvm-pdbdump.h | 23 + tools/llvm-readobj/COFFDumper.cpp | 8 + tools/llvm-readobj/llvm-readobj.cpp | 21 +- tools/llvm-rtdyld/llvm-rtdyld.cpp | 3 +- tools/obj2yaml/wasm2yaml.cpp | 63 +- tools/opt/opt.cpp | 4 +- tools/yaml2obj/yaml2wasm.cpp | 11 +- unittests/Analysis/ProfileSummaryInfoTest.cpp | 6 + unittests/Analysis/TargetLibraryInfoTest.cpp | 46 + unittests/DebugInfo/CMakeLists.txt | 2 +- unittests/DebugInfo/CodeView/CMakeLists.txt | 11 + unittests/DebugInfo/CodeView/ErrorChecking.h | 61 + .../DebugInfo/CodeView/RandomAccessVisitorTest.cpp | 353 + .../Orc/ObjectTransformLayerTest.cpp | 2 +- unittests/ExecutionEngine/Orc/OrcTestCommon.h | 2 +- .../Orc/RTDyldObjectLinkingLayerTest.cpp | 29 +- unittests/IR/ConstantRangeTest.cpp | 9 +- unittests/IR/InstructionsTest.cpp | 7 + unittests/IR/TypeBuilderTest.cpp | 30 +- unittests/Support/CMakeLists.txt | 1 + .../Support/DynamicLibrary/DynamicLibraryTest.cpp | 4 +- unittests/Support/ParallelTest.cpp | 53 + unittests/Support/Path.cpp | 2 +- unittests/Transforms/Utils/Cloning.cpp | 65 +- utils/TableGen/CodeGenInstruction.cpp | 1 + utils/TableGen/SubtargetEmitter.cpp | 2 +- utils/TableGen/X86RecognizableInstr.cpp | 125 +- utils/TableGen/X86RecognizableInstr.h | 122 + utils/git-svn/git-llvm | 33 +- utils/release/build_llvm_package.bat | 8 +- utils/vscode/README | 18 + utils/vscode/tablegen/.vscode/launch.json | 13 + utils/vscode/tablegen/CHANGELOG.md | 4 + utils/vscode/tablegen/README.md | 13 + utils/vscode/tablegen/language-configuration.json | 30 + utils/vscode/tablegen/package.json | 26 + utils/vscode/tablegen/syntaxes/TableGen.tmLanguage | 132 + utils/vscode/tablegen/vsc-extension-quickstart.md | 27 + 909 files changed, 68912 insertions(+), 33784 deletions(-) create mode 100644 include/llvm/CodeGen/ExpandReductions.h create mode 100644 include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h delete mode 100644 include/llvm/LibDriver/LibDriver.h create mode 100644 include/llvm/Support/Parallel.h create mode 100644 include/llvm/ToolDrivers/llvm-lib/LibDriver.h create mode 100644 lib/CodeGen/ExpandReductions.cpp create mode 100644 lib/CodeGen/LiveRangeShrink.cpp create mode 100644 lib/CodeGen/ScalarizeMaskedMemIntrin.cpp delete mode 
100644 lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp create mode 100644 lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp create mode 100644 lib/Fuzzer/test/OverwriteInputTest.cpp create mode 100644 lib/Fuzzer/test/afl-driver.test create mode 100644 lib/Fuzzer/test/overwrite-input.test delete mode 100644 lib/LibDriver/CMakeLists.txt delete mode 100644 lib/LibDriver/LLVMBuild.txt delete mode 100644 lib/LibDriver/LibDriver.cpp delete mode 100644 lib/LibDriver/Options.td create mode 100644 lib/Support/Parallel.cpp create mode 100644 lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp create mode 100644 lib/ToolDrivers/CMakeLists.txt create mode 100644 lib/ToolDrivers/LLVMBuild.txt create mode 100644 lib/ToolDrivers/llvm-lib/CMakeLists.txt create mode 100644 lib/ToolDrivers/llvm-lib/LLVMBuild.txt create mode 100644 lib/ToolDrivers/llvm-lib/LibDriver.cpp create mode 100644 lib/ToolDrivers/llvm-lib/Options.td create mode 100644 test/Analysis/BasicAA/cs-cs-arm.ll create mode 100644 test/Analysis/BasicAA/intrinsics-arm.ll create mode 100644 test/Analysis/CostModel/AArch64/free-widening-casts.ll create mode 100644 test/Analysis/CostModel/AMDGPU/shufflevector.ll create mode 100644 test/Analysis/ScalarEvolution/different-loops-recs.ll create mode 100644 test/Assembler/globalvariable-attributes.ll create mode 100644 test/Bitcode/globalvariable-attributes.ll create mode 100644 test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll create mode 100644 test/CodeGen/AArch64/macho-global-symbols.ll create mode 100644 test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir create mode 100644 test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg create mode 100644 test/CodeGen/AVR/select-mbb-placement-bug.ll create mode 100644 test/CodeGen/Generic/expand-experimental-reductions.ll create mode 100644 test/CodeGen/Lanai/masking_setccs.ll create mode 100644 test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir create mode 100644 test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir delete mode 100644 test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll create mode 100644 test/CodeGen/MSP430/hwmult16.ll create mode 100644 test/CodeGen/MSP430/hwmult32.ll create mode 100644 test/CodeGen/MSP430/hwmultf5.ll create mode 100644 test/CodeGen/MSP430/libcalls.ll create mode 100644 test/CodeGen/MSP430/promote-i8-mul.ll create mode 100644 test/CodeGen/PowerPC/mtvsrdd.ll create mode 100644 test/CodeGen/PowerPC/testComparesieqsc.ll create mode 100644 test/CodeGen/PowerPC/testComparesieqsi.ll create mode 100644 test/CodeGen/PowerPC/testComparesieqss.ll create mode 100644 test/CodeGen/PowerPC/testComparesiequc.ll create mode 100644 test/CodeGen/PowerPC/testComparesiequi.ll create mode 100644 test/CodeGen/PowerPC/testComparesiequs.ll create mode 100644 test/CodeGen/PowerPC/testCompareslleqsc.ll create mode 100644 test/CodeGen/PowerPC/testCompareslleqsi.ll create mode 100644 test/CodeGen/PowerPC/testCompareslleqss.ll create mode 100644 test/CodeGen/PowerPC/testComparesllequc.ll create mode 100644 test/CodeGen/PowerPC/testComparesllequi.ll create mode 100644 test/CodeGen/PowerPC/testComparesllequs.ll create mode 100644 test/CodeGen/SPARC/inlineasm-v9.ll create mode 100644 test/CodeGen/SystemZ/list-ilp-crash.ll create mode 100644 test/CodeGen/SystemZ/lower-copy-undef-src.mir delete mode 100644 test/CodeGen/X86/2012-11-30-handlemove-dbg.ll delete mode 100644 test/CodeGen/X86/2012-11-30-misched-dbg.ll delete mode 100644 test/CodeGen/X86/2012-11-30-regpres-dbg.ll create mode 100644 test/CodeGen/X86/GlobalISel/add-scalar.ll 
create mode 100644 test/CodeGen/X86/GlobalISel/br.ll create mode 100644 test/CodeGen/X86/GlobalISel/cmp.ll create mode 100644 test/CodeGen/X86/GlobalISel/legalize-cmp.mir create mode 100644 test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll create mode 100644 test/CodeGen/X86/GlobalISel/memop-scalar.ll create mode 100644 test/CodeGen/X86/GlobalISel/memop-vec.ll delete mode 100644 test/CodeGen/X86/GlobalISel/memop-x32.ll delete mode 100644 test/CodeGen/X86/GlobalISel/memop.ll create mode 100644 test/CodeGen/X86/GlobalISel/select-br.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-cmp.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-scalar.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-v128.mir delete mode 100644 test/CodeGen/X86/GlobalISel/select-memop-x32.mir delete mode 100644 test/CodeGen/X86/GlobalISel/select-memop.mir create mode 100644 test/CodeGen/X86/O0-pipeline.ll create mode 100644 test/CodeGen/X86/avx512-scalar_mask.ll create mode 100644 test/CodeGen/X86/avx512-vselect.ll create mode 100644 test/CodeGen/X86/leaFixup32.mir create mode 100644 test/CodeGen/X86/leaFixup64.mir create mode 100644 test/CodeGen/X86/lrshrink.ll create mode 100644 test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll create mode 100644 test/CodeGen/X86/vector-narrow-binop.ll create mode 100644 test/DebugInfo/COFF/no-cus.ll create mode 100644 test/DebugInfo/Inputs/typeunit-header.elf-x86-64 create mode 100644 test/DebugInfo/Inputs/typeunit-header.s create mode 100644 test/DebugInfo/X86/dbg-declare-inalloca.ll create mode 100644 test/DebugInfo/typeunit-header.test create mode 100644 test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll create mode 100644 test/Instrumentation/MemorySanitizer/pr32842.ll create mode 100644 test/LTO/Resolution/X86/ifunc.ll create mode 100644 test/MC/AArch64/directive-cpu-err.s create mode 100644 test/MC/AsmParser/altmacro_string_escape.s create mode 100644 test/MC/Disassembler/PowerPC/ppc64-encoding-p9vector.txt create mode 100644 test/Object/Inputs/COFF/empty-drectve.yaml create mode 100644 test/Object/X86/archive-symbol-table.s create mode 100644 test/Object/coff-empty-drectve.test create mode 100644 test/Object/wasm-invalid-start.test create mode 100644 test/TableGen/RegisterEncoder.td create mode 100644 test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll create mode 100644 test/Transforms/CodeExtractor/PartialInlineHighCost.ll create mode 100644 test/Transforms/CodeGenPrepare/section-samplepgo.ll create mode 100644 test/Transforms/ConstProp/calls-math-finite.ll create mode 100644 test/Transforms/ConstProp/sse.ll create mode 100644 test/Transforms/Coroutines/coro-eh-aware-edge-split.ll create mode 100644 test/Transforms/GVN/PRE/nonintegral.ll delete mode 100644 test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll create mode 100644 test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll create mode 100644 test/Transforms/InstCombine/AArch64/lit.local.cfg create mode 100644 test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll create mode 100644 test/Transforms/InstCombine/AMDGPU/lit.local.cfg create mode 100644 test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll create mode 100644 test/Transforms/InstCombine/ARM/constant-fold-hang.ll create mode 100644 test/Transforms/InstCombine/ARM/lit.local.cfg create mode 100644 test/Transforms/InstCombine/ARM/neon-intrinsics.ll create mode 100644 
test/Transforms/InstCombine/PowerPC/aligned-altivec.ll create mode 100644 test/Transforms/InstCombine/PowerPC/aligned-qpx.ll create mode 100644 test/Transforms/InstCombine/PowerPC/lit.local.cfg create mode 100644 test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll create mode 100644 test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll create mode 100644 test/Transforms/InstCombine/X86/blend_x86.ll create mode 100644 test/Transforms/InstCombine/X86/lit.local.cfg create mode 100644 test/Transforms/InstCombine/X86/pr2645-1.ll create mode 100644 test/Transforms/InstCombine/X86/shufflemask-undef.ll create mode 100644 test/Transforms/InstCombine/X86/x86-avx2.ll create mode 100644 test/Transforms/InstCombine/X86/x86-avx512.ll create mode 100644 test/Transforms/InstCombine/X86/x86-crc32-demanded.ll create mode 100644 test/Transforms/InstCombine/X86/x86-f16c.ll create mode 100644 test/Transforms/InstCombine/X86/x86-fma.ll create mode 100644 test/Transforms/InstCombine/X86/x86-insertps.ll create mode 100644 test/Transforms/InstCombine/X86/x86-masked-memops.ll create mode 100644 test/Transforms/InstCombine/X86/x86-movmsk.ll create mode 100644 test/Transforms/InstCombine/X86/x86-muldq.ll create mode 100644 test/Transforms/InstCombine/X86/x86-pack.ll create mode 100644 test/Transforms/InstCombine/X86/x86-pshufb.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse2.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse41.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse4a.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vector-shifts.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vperm2.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vpermil.ll create mode 100644 test/Transforms/InstCombine/X86/x86-xop.ll delete mode 100644 test/Transforms/InstCombine/X86FsubCmpCombine.ll delete mode 100644 test/Transforms/InstCombine/aligned-altivec.ll delete mode 100644 test/Transforms/InstCombine/aligned-qpx.ll delete mode 100644 test/Transforms/InstCombine/amdgcn-intrinsics.ll delete mode 100644 test/Transforms/InstCombine/bit-tracking.ll delete mode 100644 test/Transforms/InstCombine/blend_x86.ll delete mode 100644 test/Transforms/InstCombine/constant-fold-hang.ll create mode 100644 test/Transforms/InstCombine/constant-fold-iteration.ll delete mode 100644 test/Transforms/InstCombine/neon-intrinsics.ll delete mode 100644 test/Transforms/InstCombine/pr2645-1.ll delete mode 100644 test/Transforms/InstCombine/shufflemask-undef.ll delete mode 100644 test/Transforms/InstCombine/vsx-unaligned.ll delete mode 100644 test/Transforms/InstCombine/x86-avx2.ll delete mode 100644 test/Transforms/InstCombine/x86-avx512.ll delete mode 100644 test/Transforms/InstCombine/x86-crc32-demanded.ll delete mode 100644 test/Transforms/InstCombine/x86-f16c.ll delete mode 100644 test/Transforms/InstCombine/x86-fma.ll delete mode 100644 test/Transforms/InstCombine/x86-insertps.ll delete mode 100644 test/Transforms/InstCombine/x86-masked-memops.ll delete mode 100644 test/Transforms/InstCombine/x86-movmsk.ll delete mode 100644 test/Transforms/InstCombine/x86-muldq.ll delete mode 100644 test/Transforms/InstCombine/x86-pack.ll delete mode 100644 test/Transforms/InstCombine/x86-pshufb.ll delete mode 100644 test/Transforms/InstCombine/x86-sse.ll delete mode 100644 test/Transforms/InstCombine/x86-sse2.ll delete mode 100644 
test/Transforms/InstCombine/x86-sse41.ll delete mode 100644 test/Transforms/InstCombine/x86-sse4a.ll delete mode 100644 test/Transforms/InstCombine/x86-vector-shifts.ll delete mode 100644 test/Transforms/InstCombine/x86-vperm2.ll delete mode 100644 test/Transforms/InstCombine/x86-vpermil.ll delete mode 100644 test/Transforms/InstCombine/x86-xop.ll create mode 100644 test/Transforms/InstNamer/basic.ll delete mode 100644 test/Transforms/InstSimplify/apint-or.ll create mode 100644 test/Transforms/InstSimplify/or.ll create mode 100644 test/Transforms/LoopIdiom/ARM/ctlz.ll create mode 100644 test/Transforms/LoopIdiom/X86/ctlz.ll create mode 100644 test/Transforms/LoopVectorize/X86/svml-calls-finite.ll create mode 100644 test/Transforms/LoopVectorize/pr32859.ll create mode 100644 test/Transforms/NewGVN/pr32934.ll create mode 100644 test/Transforms/NewGVN/pr32952.ll create mode 100644 test/Transforms/NewGVN/verify-memoryphi.ll create mode 100644 test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll create mode 100644 test/Transforms/SLPVectorizer/AArch64/remarks.ll create mode 100644 test/Transforms/SLPVectorizer/X86/arith-add.ll create mode 100644 test/Transforms/SLPVectorizer/X86/arith-mul.ll create mode 100644 test/Transforms/SLPVectorizer/X86/arith-sub.ll create mode 100644 test/Transforms/SLPVectorizer/X86/shift-ashr.ll create mode 100644 test/Transforms/SLPVectorizer/X86/shift-lshr.ll create mode 100644 test/Transforms/SLPVectorizer/X86/shift-shl.ll delete mode 100644 test/Transforms/SpeculativeExecution/spec-other.ll delete mode 100644 test/Transforms/SpeculativeExecution/spec-vector.ll create mode 100644 test/tools/llvm-pdbdump/symbol-filters.test create mode 100644 test/tools/llvm-readobj/wasm-invalid.test create mode 100644 unittests/DebugInfo/CodeView/CMakeLists.txt create mode 100644 unittests/DebugInfo/CodeView/ErrorChecking.h create mode 100644 unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp create mode 100644 unittests/Support/ParallelTest.cpp create mode 100644 utils/vscode/README create mode 100644 utils/vscode/tablegen/.vscode/launch.json create mode 100644 utils/vscode/tablegen/CHANGELOG.md create mode 100644 utils/vscode/tablegen/README.md create mode 100644 utils/vscode/tablegen/language-configuration.json create mode 100644 utils/vscode/tablegen/package.json create mode 100644 utils/vscode/tablegen/syntaxes/TableGen.tmLanguage create mode 100644 utils/vscode/tablegen/vsc-extension-quickstart.md diff --git a/CREDITS.TXT b/CREDITS.TXT index 15d822a68091..20bd553ae2bc 100644 --- a/CREDITS.TXT +++ b/CREDITS.TXT @@ -265,7 +265,7 @@ D: Release manager (1.7+) N: Sylvestre Ledru E: sylvestre@debian.org W: http://sylvestre.ledru.info/ -W: http://llvm.org/apt/ +W: http://apt.llvm.org/ D: Debian and Ubuntu packaging D: Continuous integration with jenkins diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index 0331d0fa10ab..de8e9bf9a494 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -530,16 +530,6 @@ else() message(STATUS "Doxygen disabled.") endif() -if (LLVM_ENABLE_SPHINX) - message(STATUS "Sphinx enabled.") - find_package(Sphinx REQUIRED) - if (LLVM_BUILD_DOCS) - add_custom_target(sphinx ALL) - endif() -else() - message(STATUS "Sphinx disabled.") -endif() - set(LLVM_BINDINGS "") if(WIN32) message(STATUS "Go bindings disabled.") diff --git a/cmake/modules/AddSphinxTarget.cmake b/cmake/modules/AddSphinxTarget.cmake index cfc7f38e9e77..4540c5c36c8e 100644 --- a/cmake/modules/AddSphinxTarget.cmake +++ b/cmake/modules/AddSphinxTarget.cmake @@ -1,3 
+1,16 @@
+
+# Create sphinx target
+if (LLVM_ENABLE_SPHINX)
+  message(STATUS "Sphinx enabled.")
+  find_package(Sphinx REQUIRED)
+  if (LLVM_BUILD_DOCS AND NOT TARGET sphinx)
+    add_custom_target(sphinx ALL)
+  endif()
+else()
+  message(STATUS "Sphinx disabled.")
+endif()
+
+
 # Handy function for creating the different Sphinx targets.
 #
 # ``builder`` should be one of the supported builders used by
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 6dff219ae37f..4437610146c4 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -103,8 +103,8 @@ endif()
 endif()
 
 if (LLVM_ENABLE_SPHINX)
+  include(AddSphinxTarget)
   if (SPHINX_FOUND)
-    include(AddSphinxTarget)
     if (${SPHINX_OUTPUT_HTML})
       add_sphinx_target(html llvm)
     endif()
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index d5c8ba4b8214..0cb415ad764e 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -699,14 +699,14 @@ For developers to work with a git monorepo
 
 .. note::
 
-   This set-up is using unofficial mirror hosted on GitHub, use with caution.
+   This set-up is using an unofficial mirror hosted on GitHub, use with caution.
 
 To set up a clone of all the llvm projects using a unified repository:
 
 .. code-block:: console
 
   % export TOP_LEVEL_DIR=`pwd`
-  % git clone https://github.com/llvm-project/llvm-project/
+  % git clone https://github.com/llvm-project/llvm-project-20170507/ llvm-project
   % cd llvm-project
   % git config branch.master.rebase true
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index dad99e3352dd..9ff47e8366dc 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -641,8 +641,9 @@ assume that the globals are densely packed in their section and try to iterate
 over them as an array, alignment padding would break this iteration. The
 maximum alignment is ``1 << 29``.
 
-Globals can also have a :ref:`DLL storage class <dllstorageclass>` and
-an optional list of attached :ref:`metadata <metadata>`,
+Globals can also have a :ref:`DLL storage class <dllstorageclass>`,
+optional :ref:`global attributes <glattrs>` and
+an optional list of attached :ref:`metadata <metadata>`.
 Variables and aliases can have a
 :ref:`Thread Local Storage Model <tls_model>`.
 
@@ -1624,6 +1625,14 @@ example:
     the ELF x86-64 abi, but it can be disabled for some compilation
     units.
 
+.. _glattrs:
+
+Global Attributes
+-----------------
+
+Attributes may be set to communicate additional information about a global variable.
+Unlike :ref:`function attributes <fnattrs>`, attributes on a global variable
+are grouped into a single :ref:`attribute group <attrgrp>`.
 
 .. _opbundles:
 
@@ -3664,6 +3673,9 @@ Sparc:
 
 - ``I``: An immediate 13-bit signed integer.
 - ``r``: A 32-bit integer register.
+- ``f``: Any floating-point register on SparcV8, or a floating point
+  register in the "low" half of the registers on SparcV9.
+- ``e``: Any floating point register. (Same as ``f`` on SparcV8.)
 
 SystemZ:
 
@@ -11687,6 +11699,338 @@ Examples:
 
       %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
 
+
+Experimental Vector Reduction Intrinsics
+----------------------------------------
+
+Horizontal reductions of vectors can be expressed using the following
+intrinsics. Each one takes a vector operand as an input and applies its
+respective operation across all elements of the vector, returning a single
+scalar result of the same element type.
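As a rough sketch of the semantics (the function and value names below are illustrative, not taken from the patch), an integer ``ADD`` reduction of a ``<4 x i32>`` operand computes the same value as this hand-scalarized IR:

.. code-block:: llvm

      ; Illustrative scalarization of an i32 ADD reduction: extract each
      ; lane and accumulate. Targets may lower the intrinsic more directly.
      define i32 @reduce_add_by_hand(<4 x i32> %v) {
        %e0 = extractelement <4 x i32> %v, i32 0
        %e1 = extractelement <4 x i32> %v, i32 1
        %e2 = extractelement <4 x i32> %v, i32 2
        %e3 = extractelement <4 x i32> %v, i32 3
        %t0 = add i32 %e0, %e1
        %t1 = add i32 %t0, %e2
        %t2 = add i32 %t1, %e3
        ret i32 %t2
      }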
+
+
+'``llvm.experimental.vector.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a)
+      declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating point
+``ADD`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+      %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+
+'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
+      declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating point
+``MUL`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+      %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+'``llvm.experimental.vector.reduce.and.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.or.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction
+of a vector, returning the result as a scalar. The return type matches the
+element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.xor.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.smax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer
+``MAX`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.smin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer
+``MIN`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
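For reference, the integer forms above are all called the same way; a small usage sketch, assuming ``%input`` is a ``<4 x i32>`` value already in scope:

.. code-block:: llvm

      ; Assumed: %input is a <4 x i32> defined earlier in the function.
      %conj = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %input)
      %max  = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %input)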
+
+'``llvm.experimental.vector.reduce.umax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umax.*``' intrinsics perform an
+unsigned integer ``MAX`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.umin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umin.*``' intrinsics perform an
+unsigned integer ``MIN`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics perform a
+floating-point ``MAX`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+If the intrinsic call has the ``nnan`` fast-math flag, the operation may
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating-point values.
+
+'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics perform a
+floating-point ``MIN`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+If the intrinsic call has the ``nnan`` fast-math flag, the operation may
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating-point values.
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index 35687e258182..ebc3fb772e81 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -249,6 +249,14 @@ S
   Superword-Level Parallelism, same as :ref:`Basic-Block Vectorization
   <bb-vectorization>`.
 
+**Splat**
+  Splat refers to a vector of identical scalar elements.
+
+  The term comes from the PowerPC Altivec instructions that provided this
+  functionality in hardware, for example "vsplth" and the corresponding
+  software intrinsic "vec_splat()". Examples of other hardware names for
+  this operation include "duplicate" (ARM) and "broadcast" (x86).
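+
+  In LLVM IR a splat is typically expressed as an ``insertelement`` into an
+  undef vector followed by a zero-mask ``shufflevector``; a minimal sketch
+  (the scalar ``%x`` is a placeholder):
+
+  .. code-block:: llvm
+
+     %ins   = insertelement <4 x i32> undef, i32 %x, i32 0
+     %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer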
+ **SRoA** Scalar Replacement of Aggregates diff --git a/docs/LibFuzzer.rst b/docs/LibFuzzer.rst index a11baa720ec8..5acfa04ce1f4 100644 --- a/docs/LibFuzzer.rst +++ b/docs/LibFuzzer.rst @@ -305,6 +305,10 @@ The most important command line options are: - 1 : close ``stdout`` - 2 : close ``stderr`` - 3 : close both ``stdout`` and ``stderr``. +``-print_coverage`` + If 1, print coverage information as text at exit. +``-dump_coverage`` + If 1, dump coverage information as a .sancov file at exit. For the full list of flags run the fuzzer binary with ``-help=1``. @@ -543,12 +547,19 @@ You can get the coverage for your corpus like this: .. code-block:: console - ASAN_OPTIONS=coverage=1 ./fuzzer CORPUS_DIR -runs=0 + ./fuzzer CORPUS_DIR -runs=0 -print_coverage=1 This will run all tests in the CORPUS_DIR but will not perform any fuzzing. -At the end of the process it will dump a single ``.sancov`` file with coverage -information. See SanitizerCoverage_ for details on querying the file using the -``sancov`` tool. +At the end of the process it will print text describing what code has been covered and what hasn't. + +Alternatively, use + +.. code-block:: console + + ./fuzzer CORPUS_DIR -runs=0 -dump_coverage=1 + +which will dump a ``.sancov`` file with coverage information. +See SanitizerCoverage_ for details on querying the file using the ``sancov`` tool. You may also use other ways to visualize coverage, e.g. using `Clang coverage `_, diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index dbffb53d5a51..bc35e62189a2 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -40,6 +40,10 @@ Non-comprehensive list of changes in this release functionality, or simply have a lot to talk about), see the `NOTE` below for adding a new subsection. +* LLVM's ``WeakVH`` has been renamed to ``WeakTrackingVH`` and a new ``WeakVH`` + has been introduced. The new ``WeakVH`` nulls itself out on deletion, but + does not track values across RAUW. + * ... next change ... .. NOTE diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h index c3822e35906a..94fbd1a29bf9 100644 --- a/include/llvm/ADT/APInt.h +++ b/include/llvm/ADT/APInt.h @@ -157,6 +157,11 @@ private: return isSingleWord() ? U.VAL : U.pVal[whichWord(bitPosition)]; } + /// Utility method to change the bit width of this APInt to new bit width, + /// allocating and/or deallocating as necessary. There is no guarantee on the + /// value of any bits upon return. Caller should populate the bits after. + void reallocate(unsigned NewBitWidth); + /// \brief Convert a char array into an APInt /// /// \param radix 2, 8, 10, 16, or 36 @@ -1437,6 +1442,12 @@ public: /// as "bitPosition". void flipBit(unsigned bitPosition); + /// Negate this APInt in place. + void negate() { + flipAllBits(); + ++(*this); + } + /// Insert the bits from a smaller APInt starting at bitPosition. void insertBits(const APInt &SubBits, unsigned bitPosition); @@ -1646,12 +1657,7 @@ public: /// re-interprets the bits as a double. Note that it is valid to do this on /// any bit width. Exactly 64 bits will be translated. double bitsToDouble() const { - union { - uint64_t I; - double D; - } T; - T.I = (isSingleWord() ? U.VAL : U.pVal[0]); - return T.D; + return BitsToDouble(getWord(0)); } /// \brief Converts APInt bits to a double @@ -1660,12 +1666,7 @@ public: /// re-interprets the bits as a float. Note that it is valid to do this on /// any bit width. Exactly 32 bits will be translated. 
float bitsToFloat() const { - union { - unsigned I; - float F; - } T; - T.I = unsigned((isSingleWord() ? U.VAL : U.pVal[0])); - return T.F; + return BitsToFloat(getWord(0)); } /// \brief Converts a double to APInt bits. @@ -1673,12 +1674,7 @@ public: /// The conversion does not do a translation from double to integer, it just /// re-interprets the bits of the double. static APInt doubleToBits(double V) { - union { - uint64_t I; - double D; - } T; - T.D = V; - return APInt(sizeof T * CHAR_BIT, T.I); + return APInt(sizeof(double) * CHAR_BIT, DoubleToBits(V)); } /// \brief Converts a float to APInt bits. @@ -1686,12 +1682,7 @@ public: /// The conversion does not do a translation from float to integer, it just /// re-interprets the bits of the float. static APInt floatToBits(float V) { - union { - unsigned I; - float F; - } T; - T.F = V; - return APInt(sizeof T * CHAR_BIT, T.I); + return APInt(sizeof(float) * CHAR_BIT, FloatToBits(V)); } /// @} @@ -1852,10 +1843,9 @@ public: unsigned); /// DST = LHS * RHS, where DST has width the sum of the widths of the - /// operands. No overflow occurs. DST must be disjoint from both - /// operands. Returns the number of parts required to hold the result. - static unsigned tcFullMultiply(WordType *, const WordType *, - const WordType *, unsigned, unsigned); + /// operands. No overflow occurs. DST must be disjoint from both operands. + static void tcFullMultiply(WordType *, const WordType *, + const WordType *, unsigned, unsigned); /// If RHS is zero LHS and REMAINDER are left unchanged, return one. /// Otherwise set LHS to LHS / RHS with the fractional part discarded, set @@ -1997,8 +1987,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const APInt &I) { } inline APInt operator-(APInt v) { - v.flipAllBits(); - ++v; + v.negate(); return v; } diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h index e835f1516225..4a2af7cd68a6 100644 --- a/include/llvm/ADT/BitVector.h +++ b/include/llvm/ADT/BitVector.h @@ -255,7 +255,7 @@ public: /// find_prev - Returns the index of the first set bit that precedes the /// the bit at \p PriorTo. Returns -1 if all previous bits are unset. - int find_prev(unsigned PriorTo) { + int find_prev(unsigned PriorTo) const { if (PriorTo == 0) return -1; diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index 15945adbe589..8c28412bb607 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -706,6 +706,18 @@ struct is_one_of { std::is_same::value || is_one_of::value; }; +/// \brief traits class for checking whether type T is a base class for all +/// the given types in the variadic list. 
+template struct are_base_of { + static const bool value = true; +}; + +template +struct are_base_of { + static const bool value = + std::is_base_of::value && are_base_of::value; +}; + //===----------------------------------------------------------------------===// // Extra additions for arrays //===----------------------------------------------------------------------===// @@ -1079,7 +1091,7 @@ private: /// /// std::vector Items = {'A', 'B', 'C', 'D'}; /// for (auto X : enumerate(Items)) { -/// printf("Item %d - %c\n", X.Index, X.Value); +/// printf("Item %d - %c\n", X.index(), X.value()); /// } /// /// Output: diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index 26f11924b771..1c109be3bab3 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -106,6 +106,13 @@ static inline std::string fromHex(StringRef Input) { return Output; } +/// \brief Convert the string \p S to an integer of the specified type using +/// the radix \p Base. If \p Base is 0, auto-detects the radix. +/// Returns true if the number was successfully converted, false otherwise. +template bool to_integer(StringRef S, N &Num, unsigned Base = 0) { + return !S.getAsInteger(Base, Num); +} + static inline std::string utostr(uint64_t X, bool isNeg = false) { char Buffer[21]; char *BufPtr = std::end(Buffer); diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h index cc4788d3edae..01469a25c96c 100644 --- a/include/llvm/Analysis/CallGraph.h +++ b/include/llvm/Analysis/CallGraph.h @@ -41,12 +41,6 @@ /// of all of the caller-callee relationships, which is useful for /// transformations. /// -/// The CallGraph class also attempts to figure out what the root of the -/// CallGraph is, which it currently does by looking for a function named -/// 'main'. If no function named 'main' is found, the external node is used as -/// the entry node, reflecting the fact that any function without internal -/// linkage could be called into (which is common for libraries). -/// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_CALLGRAPH_H @@ -82,10 +76,6 @@ class CallGraph { /// \brief A map from \c Function* to \c CallGraphNode*. FunctionMapTy FunctionMap; - /// \brief Root is root of the call graph, or the external node if a 'main' - /// function couldn't be found. - CallGraphNode *Root; - /// \brief This node has edges to all external functions and those internal /// functions that have their address taken. CallGraphNode *ExternalCallingNode; diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h index 75c4cbd03706..c5f97083af4d 100644 --- a/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/include/llvm/Analysis/ProfileSummaryInfo.h @@ -67,8 +67,8 @@ public: } /// Returns the profile count for \p CallInst. - static Optional getProfileCount(const Instruction *CallInst, - BlockFrequencyInfo *BFI); + Optional getProfileCount(const Instruction *CallInst, + BlockFrequencyInfo *BFI); /// \brief Returns true if \p F has hot function entry. bool isFunctionEntryHot(const Function *F); /// Returns true if \p F has hot function entry or hot call edge. 
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 85350fa159d6..ceca6cb389a1 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -568,27 +568,16 @@ private: Predicates.insert(P); } - /*implicit*/ ExitLimit(const SCEV *E) - : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {} + /*implicit*/ ExitLimit(const SCEV *E); ExitLimit( const SCEV *E, const SCEV *M, bool MaxOrZero, - ArrayRef *> PredSetList) - : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) { - assert((isa(ExactNotTaken) || - !isa(MaxNotTaken)) && - "Exact is not allowed to be less precise than Max"); - for (auto *PredSet : PredSetList) - for (auto *P : *PredSet) - addPredicate(P); - } + ArrayRef *> PredSetList); ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero, - const SmallPtrSetImpl &PredSet) - : ExitLimit(E, M, MaxOrZero, {&PredSet}) {} + const SmallPtrSetImpl &PredSet); - ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero) - : ExitLimit(E, M, MaxOrZero, None) {} + ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero); /// Test whether this ExitLimit contains any computed information, or /// whether it's all SCEVCouldNotCompute values. @@ -782,7 +771,7 @@ private: /// Set the memoized range for the given SCEV. const ConstantRange &setRange(const SCEV *S, RangeSignHint Hint, - ConstantRange &&CR) { + ConstantRange CR) { DenseMap &Cache = Hint == HINT_RANGE_UNSIGNED ? UnsignedRanges : SignedRanges; diff --git a/include/llvm/Analysis/TargetLibraryInfo.def b/include/llvm/Analysis/TargetLibraryInfo.def index 099a3c7cf2ac..9cbe917c146d 100644 --- a/include/llvm/Analysis/TargetLibraryInfo.def +++ b/include/llvm/Analysis/TargetLibraryInfo.def @@ -161,6 +161,60 @@ TLI_DEFINE_STRING_INTERNAL("_Znwm") /// void *new(unsigned long, nothrow); TLI_DEFINE_ENUM_INTERNAL(ZnwmRKSt9nothrow_t) TLI_DEFINE_STRING_INTERNAL("_ZnwmRKSt9nothrow_t") +/// double __acos_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(acos_finite) +TLI_DEFINE_STRING_INTERNAL("__acos_finite") +/// float __acosf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(acosf_finite) +TLI_DEFINE_STRING_INTERNAL("__acosf_finite") +/// double __acosh_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(acosh_finite) +TLI_DEFINE_STRING_INTERNAL("__acosh_finite") +/// float __acoshf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(acoshf_finite) +TLI_DEFINE_STRING_INTERNAL("__acoshf_finite") +/// long double __acoshl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(acoshl_finite) +TLI_DEFINE_STRING_INTERNAL("__acoshl_finite") +/// long double __acosl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(acosl_finite) +TLI_DEFINE_STRING_INTERNAL("__acosl_finite") +/// double __asin_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(asin_finite) +TLI_DEFINE_STRING_INTERNAL("__asin_finite") +/// float __asinf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(asinf_finite) +TLI_DEFINE_STRING_INTERNAL("__asinf_finite") +/// long double __asinl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(asinl_finite) +TLI_DEFINE_STRING_INTERNAL("__asinl_finite") +/// double atan2_finite(double y, double x); +TLI_DEFINE_ENUM_INTERNAL(atan2_finite) +TLI_DEFINE_STRING_INTERNAL("__atan2_finite") +/// float atan2f_finite(float y, float x); +TLI_DEFINE_ENUM_INTERNAL(atan2f_finite) +TLI_DEFINE_STRING_INTERNAL("__atan2f_finite") +/// long double atan2l_finite(long double y, long double x); +TLI_DEFINE_ENUM_INTERNAL(atan2l_finite) +TLI_DEFINE_STRING_INTERNAL("__atan2l_finite") +/// double __atanh_finite(double x); 
+TLI_DEFINE_ENUM_INTERNAL(atanh_finite) +TLI_DEFINE_STRING_INTERNAL("__atanh_finite") +/// float __atanhf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(atanhf_finite) +TLI_DEFINE_STRING_INTERNAL("__atanhf_finite") +/// long double __atanhl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(atanhl_finite) +TLI_DEFINE_STRING_INTERNAL("__atanhl_finite") +/// double __cosh_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(cosh_finite) +TLI_DEFINE_STRING_INTERNAL("__cosh_finite") +/// float __coshf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(coshf_finite) +TLI_DEFINE_STRING_INTERNAL("__coshf_finite") +/// long double __coshl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(coshl_finite) +TLI_DEFINE_STRING_INTERNAL("__coshl_finite") /// double __cospi(double x); TLI_DEFINE_ENUM_INTERNAL(cospi) TLI_DEFINE_STRING_INTERNAL("__cospi") @@ -180,12 +234,66 @@ TLI_DEFINE_STRING_INTERNAL("__cxa_guard_acquire") /// void __cxa_guard_release(guard_t *guard); TLI_DEFINE_ENUM_INTERNAL(cxa_guard_release) TLI_DEFINE_STRING_INTERNAL("__cxa_guard_release") +/// double __exp10_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(exp10_finite) +TLI_DEFINE_STRING_INTERNAL("__exp10_finite") +/// float __exp10f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(exp10f_finite) +TLI_DEFINE_STRING_INTERNAL("__exp10f_finite") +/// long double __exp10l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(exp10l_finite) +TLI_DEFINE_STRING_INTERNAL("__exp10l_finite") +/// double __exp2_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(exp2_finite) +TLI_DEFINE_STRING_INTERNAL("__exp2_finite") +/// float __exp2f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(exp2f_finite) +TLI_DEFINE_STRING_INTERNAL("__exp2f_finite") +/// long double __exp2l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(exp2l_finite) +TLI_DEFINE_STRING_INTERNAL("__exp2l_finite") +/// double __exp_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(exp_finite) +TLI_DEFINE_STRING_INTERNAL("__exp_finite") +/// float __expf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(expf_finite) +TLI_DEFINE_STRING_INTERNAL("__expf_finite") +/// long double __expl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(expl_finite) +TLI_DEFINE_STRING_INTERNAL("__expl_finite") /// int __isoc99_scanf (const char *format, ...) TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_scanf) TLI_DEFINE_STRING_INTERNAL("__isoc99_scanf") /// int __isoc99_sscanf(const char *s, const char *format, ...) 
TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_sscanf) TLI_DEFINE_STRING_INTERNAL("__isoc99_sscanf") +/// double __log10_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(log10_finite) +TLI_DEFINE_STRING_INTERNAL("__log10_finite") +/// float __log10f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(log10f_finite) +TLI_DEFINE_STRING_INTERNAL("__log10f_finite") +/// long double __log10l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(log10l_finite) +TLI_DEFINE_STRING_INTERNAL("__log10l_finite") +/// double __log2_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(log2_finite) +TLI_DEFINE_STRING_INTERNAL("__log2_finite") +/// float __log2f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(log2f_finite) +TLI_DEFINE_STRING_INTERNAL("__log2f_finite") +/// long double __log2l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(log2l_finite) +TLI_DEFINE_STRING_INTERNAL("__log2l_finite") +/// double __log_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(log_finite) +TLI_DEFINE_STRING_INTERNAL("__log_finite") +/// float __logf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(logf_finite) +TLI_DEFINE_STRING_INTERNAL("__logf_finite") +/// long double __logl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(logl_finite) +TLI_DEFINE_STRING_INTERNAL("__logl_finite") /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size); TLI_DEFINE_ENUM_INTERNAL(memcpy_chk) TLI_DEFINE_STRING_INTERNAL("__memcpy_chk") @@ -199,13 +307,30 @@ TLI_DEFINE_STRING_INTERNAL("__memset_chk") // int __nvvm_reflect(const char *) TLI_DEFINE_ENUM_INTERNAL(nvvm_reflect) TLI_DEFINE_STRING_INTERNAL("__nvvm_reflect") - +/// double __pow_finite(double x, double y); +TLI_DEFINE_ENUM_INTERNAL(pow_finite) +TLI_DEFINE_STRING_INTERNAL("__pow_finite") +/// float _powf_finite(float x, float y); +TLI_DEFINE_ENUM_INTERNAL(powf_finite) +TLI_DEFINE_STRING_INTERNAL("__powf_finite") +/// long double __powl_finite(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(powl_finite) +TLI_DEFINE_STRING_INTERNAL("__powl_finite") /// double __sincospi_stret(double x); TLI_DEFINE_ENUM_INTERNAL(sincospi_stret) TLI_DEFINE_STRING_INTERNAL("__sincospi_stret") /// float __sincospif_stret(float x); TLI_DEFINE_ENUM_INTERNAL(sincospif_stret) TLI_DEFINE_STRING_INTERNAL("__sincospif_stret") +/// double __sinh_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(sinh_finite) +TLI_DEFINE_STRING_INTERNAL("__sinh_finite") +/// float _sinhf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(sinhf_finite) +TLI_DEFINE_STRING_INTERNAL("__sinhf_finite") +/// long double __sinhl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(sinhl_finite) +TLI_DEFINE_STRING_INTERNAL("__sinhl_finite") /// double __sinpi(double x); TLI_DEFINE_ENUM_INTERNAL(sinpi) TLI_DEFINE_STRING_INTERNAL("__sinpi") diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index b9639dba1881..0a0af384c3e6 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -537,6 +537,9 @@ public: /// \return The width of the largest scalar or vector register type. unsigned getRegisterBitWidth(bool Vector) const; + /// \return The width of the smallest vector register type. + unsigned getMinVectorRegisterBitWidth() const; + /// \return True if it should be considered for address type promotion. /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is /// profitable without finding other extensions fed by the same input. @@ -740,6 +743,22 @@ public: unsigned ChainSizeInBytes, VectorType *VecTy) const; + /// Flags describing the kind of vector reduction. 
+ struct ReductionFlags { + ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {} + bool IsMaxOp; ///< If the op a min/max kind, true if it's a max operation. + bool IsSigned; ///< Whether the operation is a signed int reduction. + bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present. + }; + + /// \returns True if the target wants to handle the given reduction idiom in + /// the intrinsics form instead of the shuffle form. + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const; + + /// \returns True if the target wants to expand the given reduction intrinsic + /// into a shuffle sequence. + bool shouldExpandReduction(const IntrinsicInst *II) const; /// @} private: @@ -824,6 +843,7 @@ public: Type *Ty) = 0; virtual unsigned getNumberOfRegisters(bool Vector) = 0; virtual unsigned getRegisterBitWidth(bool Vector) = 0; + virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() = 0; @@ -895,6 +915,9 @@ public: virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags) const = 0; + virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; }; template @@ -1057,6 +1080,9 @@ public: unsigned getRegisterBitWidth(bool Vector) override { return Impl.getRegisterBitWidth(Vector); } + unsigned getMinVectorRegisterBitWidth() override { + return Impl.getMinVectorRegisterBitWidth(); + } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override { return Impl.shouldConsiderAddressTypePromotion( @@ -1200,6 +1226,13 @@ public: VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const override { + return Impl.useReductionIntrinsic(Opcode, Ty, Flags); + } + bool shouldExpandReduction(const IntrinsicInst *II) const override { + return Impl.shouldExpandReduction(II); + } }; template diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index d7fda9e14b05..550e84ad90c4 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -311,6 +311,8 @@ public: unsigned getRegisterBitWidth(bool Vector) { return 32; } + unsigned getMinVectorRegisterBitWidth() { return 128; } + bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { @@ -456,6 +458,16 @@ public: VectorType *VecTy) const { return VF; } + + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return false; + } + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return true; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index a54c39e3ea3a..f5f323c6b797 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -56,6 +56,11 @@ template class ArrayRef; const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr, OptimizationRemarkEmitter *ORE = nullptr); + /// Returns the known bits rather than passing by reference. + KnownBits computeKnownBits(const Value *V, const DataLayout &DL, + unsigned Depth = 0, AssumptionCache *AC = nullptr, + const Instruction *CxtI = nullptr, + const DominatorTree *DT = nullptr); /// Compute known bits from the range metadata. /// \p KnownZero the set of bits that are known to be zero /// \p KnownOne the set of bits that are known to be one @@ -68,14 +73,6 @@ template class ArrayRef; const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); - /// Determine whether the sign bit is known to be zero or one. Convenience - /// wrapper around computeKnownBits. - void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - const DataLayout &DL, unsigned Depth = 0, - AssumptionCache *AC = nullptr, - const Instruction *CxtI = nullptr, - const DominatorTree *DT = nullptr); - /// Return true if the given value is known to have exactly one bit set when /// defined. For vectors return true if every element is known to be a power /// of two when defined. Supports values with integer or pointer type and diff --git a/include/llvm/Bitcode/BitcodeReader.h b/include/llvm/Bitcode/BitcodeReader.h index 54f990d00233..31ffb7645f3a 100644 --- a/include/llvm/Bitcode/BitcodeReader.h +++ b/include/llvm/Bitcode/BitcodeReader.h @@ -152,10 +152,11 @@ namespace llvm { /// Parse the module summary index out of an IR file and return the module /// summary index object if found, or an empty summary if not. If Path refers - /// to an empty file and the -ignore-empty-index-file cl::opt flag is passed + /// to an empty file and IgnoreEmptyThinLTOIndexFile is true, then /// this function will return nullptr. Expected> - getModuleSummaryIndexForFile(StringRef Path); + getModuleSummaryIndexForFile(StringRef Path, + bool IgnoreEmptyThinLTOIndexFile = false); /// isBitcodeWrapper - Return true if the given bytes are the magic bytes /// for an LLVM IR bitcode wrapper. diff --git a/include/llvm/CodeGen/ExpandReductions.h b/include/llvm/CodeGen/ExpandReductions.h new file mode 100644 index 000000000000..c6aaaad967b3 --- /dev/null +++ b/include/llvm/CodeGen/ExpandReductions.h @@ -0,0 +1,24 @@ +//===----- ExpandReductions.h - Expand experimental reduction intrinsics --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_EXPANDREDUCTIONS_H +#define LLVM_CODEGEN_EXPANDREDUCTIONS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class ExpandReductionsPass + : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_CODEGEN_EXPANDREDUCTIONS_H diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 30d67eb49923..21354ae20ed1 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -145,7 +145,7 @@ public: /// Iterate the given function (typically something like doubling the width) /// on Ty until we find a legal type for this operation. - LLT findLegalType(const InstrAspect &Aspect, + Optional findLegalType(const InstrAspect &Aspect, function_ref NextType) const { LegalizeAction Action; const TypeMap &Map = Actions[Aspect.Opcode - FirstOp][Aspect.Idx]; @@ -153,8 +153,12 @@ public: do { Ty = NextType(Ty); auto ActionIt = Map.find(Ty); - if (ActionIt == Map.end()) - Action = DefaultActions.find(Aspect.Opcode)->second; + if (ActionIt == Map.end()) { + auto DefaultIt = DefaultActions.find(Aspect.Opcode); + if (DefaultIt == DefaultActions.end()) + return None; + Action = DefaultIt->second; + } else Action = ActionIt->second; } while(Action != Legal); @@ -163,11 +167,14 @@ public: /// Find what type it's actually OK to perform the given operation on, given /// the general approach we've decided to take. - LLT findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const; + Optional findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const; std::pair findLegalAction(const InstrAspect &Aspect, LegalizeAction Action) const { - return std::make_pair(Action, findLegalType(Aspect, Action)); + auto LegalType = findLegalType(Aspect, Action); + if (!LegalType) + return std::make_pair(LegalizeAction::Unsupported, LLT()); + return std::make_pair(Action, *LegalType); } /// Find the specified \p Aspect in the primary (explicitly set) Actions diff --git a/include/llvm/CodeGen/GlobalISel/Utils.h b/include/llvm/CodeGen/GlobalISel/Utils.h index 92bc9736141a..69d507069808 100644 --- a/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/include/llvm/CodeGen/GlobalISel/Utils.h @@ -30,6 +30,7 @@ class TargetInstrInfo; class TargetPassConfig; class TargetRegisterInfo; class Twine; +class ConstantFP; /// Try to constrain Reg so that it is usable by argument OpIdx of the /// provided MCInstrDesc \p II. If this fails, create a new virtual @@ -62,6 +63,8 @@ void reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, Optional getConstantVRegVal(unsigned VReg, const MachineRegisterInfo &MRI); +const ConstantFP* getConstantFPVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI); } // End namespace llvm. #endif diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index ca0f3fbad892..f2a9a9f73ca6 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -644,6 +644,13 @@ namespace ISD { /// of a call sequence, and carry arbitrary information that target might /// want to know. The first operand is a chain, the rest are specified by /// the target and not touched by the DAG optimizers. 
+ /// Targets that may use stack to pass call arguments define additional + /// operands: + /// - size of the call frame part that must be set up within the + /// CALLSEQ_START..CALLSEQ_END pair, + /// - part of the call frame prepared prior to CALLSEQ_START. + /// Both these parameters must be constants, their sum is the total call + /// frame size. /// CALLSEQ_START..CALLSEQ_END pairs may not be nested. CALLSEQ_START, // Beginning of a call sequence CALLSEQ_END, // End of a call sequence @@ -783,6 +790,20 @@ namespace ISD { /// known nonzero constant. The only operand here is the chain. GET_DYNAMIC_AREA_OFFSET, + /// Generic reduction nodes. These nodes represent horizontal vector + /// reduction operations, producing a scalar result. + /// The STRICT variants perform reductions in sequential order. The first + /// operand is an initial scalar accumulator value, and the second operand + /// is the vector to reduce. + VECREDUCE_STRICT_FADD, VECREDUCE_STRICT_FMUL, + /// These reductions are non-strict, and have a single vector operand. + VECREDUCE_FADD, VECREDUCE_FMUL, + VECREDUCE_ADD, VECREDUCE_MUL, + VECREDUCE_AND, VECREDUCE_OR, VECREDUCE_XOR, + VECREDUCE_SMAX, VECREDUCE_SMIN, VECREDUCE_UMAX, VECREDUCE_UMIN, + /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants. + VECREDUCE_FMAX, VECREDUCE_FMIN, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/include/llvm/CodeGen/MachineCombinerPattern.h b/include/llvm/CodeGen/MachineCombinerPattern.h index 11238016d447..8c54ae925470 100644 --- a/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/include/llvm/CodeGen/MachineCombinerPattern.h @@ -48,6 +48,8 @@ enum class MachineCombinerPattern { FMULADDD_OP2, FMULSUBD_OP1, FMULSUBD_OP2, + FNMULSUBS_OP1, + FNMULSUBD_OP1, FMLAv1i32_indexed_OP1, FMLAv1i32_indexed_OP2, FMLAv1i64_indexed_OP1, diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 42299b529410..8a5a1997386f 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -68,6 +68,10 @@ namespace llvm { /// matching during instruction selection. FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr); + /// createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather + /// and scatter intrinsics with scalar code when target doesn't support them. + FunctionPass *createScalarizeMaskedMemIntrinPass(); + /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg /// load-linked/store-conditional loops. extern char &AtomicExpandID; @@ -129,6 +133,10 @@ namespace llvm { // instruction and update the MachineFunctionInfo with that information. extern char &ShrinkWrapID; + /// LiveRangeShrink pass. Move instruction close to its definition to shrink + /// the definition's live range. + extern char &LiveRangeShrinkID; + /// Greedy register allocator. extern char &RAGreedyID; @@ -405,6 +413,10 @@ namespace llvm { /// printing assembly. ModulePass *createMachineOutlinerPass(); + /// This pass expands the experimental reduction intrinsics into sequences of + /// shuffles. + FunctionPass *createExpandReductionsPass(); + } // End llvm namespace /// Target machine pass initializer for passes with dependencies. 
Use with diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 9e1d148c7ce5..d761661f763e 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -406,7 +406,7 @@ public: /// certain types of nodes together, or eliminating superfluous nodes. The /// Level argument controls whether Combine is allowed to produce nodes and /// types that are illegal on the target. - void Combine(CombineLevel Level, AliasAnalysis &AA, + void Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel); /// This transforms the SelectionDAG into a SelectionDAG that @@ -737,11 +737,15 @@ public: /// \brief Create a logical NOT operation as (XOR Val, BooleanOne). SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT); - /// Return a new CALLSEQ_START node, which always must have a glue result - /// (to ensure it's not CSE'd). CALLSEQ_START does not have a useful SDLoc. - SDValue getCALLSEQ_START(SDValue Chain, SDValue Op, const SDLoc &DL) { + /// Return a new CALLSEQ_START node, that starts new call frame, in which + /// InSize bytes are set up inside CALLSEQ_START..CALLSEQ_END sequence and + /// OutSize specifies part of the frame set up prior to the sequence. + SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, + const SDLoc &DL) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, Op }; + SDValue Ops[] = { Chain, + getIntPtrConstant(InSize, DL, true), + getIntPtrConstant(OutSize, DL, true) }; return getNode(ISD::CALLSEQ_START, DL, VTs, Ops); } diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index e9012db7602d..f3122f0bf7f0 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -26,6 +26,7 @@ public: void addTypeServerHandler(TypeServerHandler &Handler); + Error visitTypeRecord(CVType &Record, TypeIndex Index); Error visitTypeRecord(CVType &Record); Error visitMemberRecord(CVMemberRecord &Record); @@ -37,6 +38,9 @@ public: Error visitFieldListMemberStream(BinaryStreamReader Reader); private: + Expected handleTypeServer(CVType &Record); + Error finishVisitation(CVType &Record); + /// The interface to the class that gets notified of each visitation. TypeVisitorCallbacks &Callbacks; diff --git a/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h new file mode 100644 index 000000000000..35a8010f1163 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h @@ -0,0 +1,103 @@ +//===- RandomAccessTypeVisitor.h ------------------------------ *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+#define LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace codeview {
+
+class TypeDatabase;
+class TypeServerHandler;
+class TypeVisitorCallbacks;
+
+/// \brief Provides amortized O(1) random access to a CodeView type stream.
+/// Normally, to access a type from a type stream you must know its byte
+/// offset into the type stream, because type records are variable-length.
+/// However, this is not the way we prefer to access them. For example, given
+/// a symbol record, one of the fields may be the TypeIndex of the symbol's
+/// type record. Or given a type record such as an array type, there might
+/// be a TypeIndex for the element type. Sequential access is perfect when
+/// we're just dumping every entry, but it's very poor for real-world usage.
+///
+/// Type streams in PDBs contain an additional field which is a list of pairs
+/// containing indices and their corresponding offsets, roughly every 8KB of
+/// record data. This general idea need not be confined to PDBs though. By
+/// supplying such an array, the producer of a type stream can give the
+/// consumer much better access time, because the consumer can find the
+/// nearest index in this array and do a linear scan forward only from there.
+///
+/// RandomAccessTypeVisitor implements this algorithm, and goes one step
+/// further by caching the offset of every record that has been visited at
+/// least once. This way, even repeated visits of the same record will never
+/// require more than one linear scan. For a type stream of N elements divided
+/// into M chunks of roughly equal size, this yields a worst case lookup time
+/// of O(N/M) and an amortized time of O(1).
+class RandomAccessTypeVisitor {
+  typedef FixedStreamArray<TypeIndexOffset> PartialOffsetArray;
+
+public:
+  RandomAccessTypeVisitor(const CVTypeArray &Types, uint32_t NumRecords,
+                          PartialOffsetArray PartialOffsets);
+
+  Error visitTypeIndex(TypeIndex Index, TypeVisitorCallbacks &Callbacks);
+
+  const TypeDatabase &database() const { return Database; }
+
+private:
+  Error visitRangeForType(TypeIndex TI);
+  Error visitRange(TypeIndex Begin, uint32_t BeginOffset, TypeIndex End);
+
+  /// Visited records get automatically added to the type database.
+  TypeDatabase Database;
+
+  /// The type array to allow random access visitation of.
+  const CVTypeArray &Types;
+
+  /// The database visitor which adds new records to the database.
+  TypeDatabaseVisitor DatabaseVisitor;
+
+  /// The deserializer which deserializes new records.
+  TypeDeserializer Deserializer;
+
+  /// The visitation callback pipeline to use. By default this contains a
+  /// deserializer and a type database visitor, but the callback specified
+  /// in the constructor is also added.
+  TypeVisitorCallbackPipeline Pipeline;
+
+  /// The visitor used to visit the internal pipeline for deserialization and
+  /// database maintenance.
+ CVTypeVisitor InternalVisitor; + + /// A vector mapping type indices to type offset. For every record that has + /// been visited, contains the absolute offset of that record in the record + /// array. + std::vector KnownOffsets; + + /// An array of index offsets for the given type stream, allowing log(N) + /// lookups of a type record by index. Similar to KnownOffsets but only + /// contains offsets for some type indices, some of which may not have + /// ever been visited. + PartialOffsetArray PartialOffsets; +}; + +} // end namespace codeview +} // end namespace llvm + +#endif // LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabase.h b/include/llvm/DebugInfo/CodeView/TypeDatabase.h index be7b19e7df0c..92c15ebd8b2b 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDatabase.h +++ b/include/llvm/DebugInfo/CodeView/TypeDatabase.h @@ -10,6 +10,7 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H #define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" @@ -20,14 +21,16 @@ namespace llvm { namespace codeview { class TypeDatabase { + friend class RandomAccessTypeVisitor; + public: - explicit TypeDatabase(uint32_t ExpectedSize); + explicit TypeDatabase(uint32_t Capacity); - /// Gets the type index for the next type record. - TypeIndex getNextTypeIndex() const; + /// Records the name of a type, and reserves its type index. + TypeIndex appendType(StringRef Name, const CVType &Data); /// Records the name of a type, and reserves its type index. - void recordType(StringRef Name, const CVType &Data); + void recordType(StringRef Name, TypeIndex Index, const CVType &Data); /// Saves the name in a StringSet and creates a stable StringRef. StringRef saveTypeName(StringRef TypeName); @@ -37,13 +40,21 @@ public: const CVType &getTypeRecord(TypeIndex Index) const; CVType &getTypeRecord(TypeIndex Index); - bool containsTypeIndex(TypeIndex Index) const; + bool contains(TypeIndex Index) const; uint32_t size() const; + uint32_t capacity() const; + bool empty() const; + + TypeIndex getAppendIndex() const; private: + void grow(); + BumpPtrAllocator Allocator; + uint32_t Count = 0; + /// All user defined type records in .debug$T live in here. Type indices /// greater than 0x1000 are user defined. Subtract 0x1000 from the index to /// index into this vector. @@ -51,6 +62,8 @@ private: SmallVector TypeRecords; StringSaver TypeNameStorage; + + BitVector ValidRecords; }; } } diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h index 39d234cf9814..c064e19a7e90 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h @@ -10,6 +10,8 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H #define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H +#include "llvm/ADT/PointerUnion.h" + #include "llvm/DebugInfo/CodeView/TypeDatabase.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" @@ -21,11 +23,12 @@ namespace codeview { /// Dumper for CodeView type streams found in COFF object files and PDB files. class TypeDatabaseVisitor : public TypeVisitorCallbacks { public: - explicit TypeDatabaseVisitor(TypeDatabase &TypeDB) : TypeDB(TypeDB) {} + explicit TypeDatabaseVisitor(TypeDatabase &TypeDB) : TypeDB(&TypeDB) {} /// Paired begin/end actions for all types. 
Receives all record data, /// including the fixed-length record prefix. Error visitTypeBegin(CVType &Record) override; + Error visitTypeBegin(CVType &Record, TypeIndex Index) override; Error visitTypeEnd(CVType &Record) override; Error visitMemberBegin(CVMemberRecord &Record) override; Error visitMemberEnd(CVMemberRecord &Record) override; @@ -39,12 +42,18 @@ public: #include "TypeRecords.def" private: + StringRef getTypeName(TypeIndex Index) const; + StringRef saveTypeName(StringRef Name); + bool IsInFieldList = false; /// Name of the current type. Only valid before visitTypeEnd. StringRef Name; + /// Current type index. Only valid before visitTypeEnd, and if we are + /// visiting a random access type database. + Optional CurrentTypeIndex; - TypeDatabase &TypeDB; + TypeDatabase *TypeDB; }; } // end namespace codeview diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h index 0e3443789170..2142d4a2dec7 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h @@ -46,6 +46,10 @@ public: return Mapping->Mapping.visitTypeBegin(Record); } + Error visitTypeBegin(CVType &Record, TypeIndex Index) override { + return visitTypeBegin(Record); + } + Error visitTypeEnd(CVType &Record) override { assert(Mapping && "Not in a type mapping!"); auto EC = Mapping->Mapping.visitTypeEnd(Record); diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h index 00bb09137e48..6f10afb30d60 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h @@ -45,6 +45,7 @@ public: /// Paired begin/end actions for all types. Receives all record data, /// including the fixed-length record prefix. 
Error visitTypeBegin(CVType &Record) override; + Error visitTypeBegin(CVType &Record, TypeIndex Index) override; Error visitTypeEnd(CVType &Record) override; Error visitMemberBegin(CVMemberRecord &Record) override; Error visitMemberEnd(CVMemberRecord &Record) override; diff --git a/include/llvm/DebugInfo/CodeView/TypeIndex.h b/include/llvm/DebugInfo/CodeView/TypeIndex.h index 3c11d248fa72..b5d695fc49d5 100644 --- a/include/llvm/DebugInfo/CodeView/TypeIndex.h +++ b/include/llvm/DebugInfo/CodeView/TypeIndex.h @@ -106,6 +106,15 @@ public: bool isNoneType() const { return *this == None(); } + uint32_t toArrayIndex() const { + assert(!isSimple()); + return getIndex() - FirstNonSimpleIndex; + } + + static TypeIndex fromArrayIndex(uint32_t Index) { + return TypeIndex(Index + FirstNonSimpleIndex); + } + SimpleTypeKind getSimpleKind() const { assert(isSimple()); return static_cast(Index & SimpleKindMask); @@ -159,6 +168,39 @@ public: static TypeIndex Float32() { return TypeIndex(SimpleTypeKind::Float32); } static TypeIndex Float64() { return TypeIndex(SimpleTypeKind::Float64); } + TypeIndex &operator+=(unsigned N) { + Index += N; + return *this; + } + + TypeIndex &operator++() { + Index += 1; + return *this; + } + + TypeIndex operator++(int) { + TypeIndex Copy = *this; + operator++(); + return Copy; + } + + TypeIndex &operator-=(unsigned N) { + assert(Index >= N); + Index -= N; + return *this; + } + + TypeIndex &operator--() { + Index -= 1; + return *this; + } + + TypeIndex operator--(int) { + TypeIndex Copy = *this; + operator--(); + return Copy; + } + friend inline bool operator==(const TypeIndex &A, const TypeIndex &B) { return A.getIndex() == B.getIndex(); } @@ -183,10 +225,30 @@ public: return A.getIndex() >= B.getIndex(); } + friend inline TypeIndex operator+(const TypeIndex &A, uint32_t N) { + TypeIndex Result(A); + Result += N; + return Result; + } + + friend inline TypeIndex operator-(const TypeIndex &A, uint32_t N) { + assert(A.getIndex() >= N); + TypeIndex Result(A); + Result -= N; + return Result; + } + private: support::ulittle32_t Index; }; +// Used for pseudo-indexing an array of type records. An array of such records +// sorted by TypeIndex can allow log(N) lookups even though such a type record +// stream does not provide random access. +struct TypeIndexOffset { + TypeIndex Type; + support::ulittle32_t Offset; +}; } } diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h index f25129691041..ed48df33249f 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h @@ -47,6 +47,14 @@ public: return Error::success(); } + Error visitTypeBegin(CVType &Record, TypeIndex Index) override { + for (auto Visitor : Pipeline) { + if (auto EC = Visitor->visitTypeBegin(Record, Index)) + return EC; + } + return Error::success(); + } + Error visitTypeEnd(CVType &Record) override { for (auto Visitor : Pipeline) { if (auto EC = Visitor->visitTypeEnd(Record)) diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h index 5e27df346b00..2950c7d27cb6 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h @@ -26,8 +26,15 @@ public: virtual Error visitUnknownType(CVType &Record) { return Error::success(); } /// Paired begin/end actions for all types. 
Receives all record data,
  /// including the fixed-length record prefix. visitTypeBegin() should return
-  /// the type of the Record, or an error if it cannot be determined.
+  /// the type of the Record, or an error if it cannot be determined. Exactly
+  /// one of the two visitTypeBegin methods will be called, depending on
+  /// whether records are being visited sequentially or randomly. An
+  /// implementation should be prepared to handle both (or assert if it
+  /// cannot handle random-access visitation).
   virtual Error visitTypeBegin(CVType &Record) { return Error::success(); }
+  virtual Error visitTypeBegin(CVType &Record, TypeIndex Index) {
+    return Error::success();
+  }
   virtual Error visitTypeEnd(CVType &Record) { return Error::success(); }
 
   virtual Error visitUnknownMember(CVMemberRecord &Record) {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 3fae8b441439..ca82a68ead31 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -43,13 +43,6 @@ namespace llvm {
 class MemoryBuffer;
 class raw_ostream;
 
-// In place of applying the relocations to the data we've read from disk we use
-// a separate mapping table to the side and checking that at locations in the
-// dwarf where we expect relocated values. This adds a bit of complexity to the
-// dwarf parsing/extraction at the benefit of not allocating memory for the
-// entire size of the debug info sections.
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t>> RelocAddrMap;
-
 /// Reads a value from data extractor and applies a relocation to the result if
 /// one exists for the given offset.
 uint64_t getRelocatedValue(const DataExtractor &Data, uint32_t Size,
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index e21245b97b73..39a7ef71de97 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -30,7 +30,7 @@ public:
   struct FileNameEntry {
     FileNameEntry() = default;
 
-    StringRef Name = StringRef();
+    StringRef Name;
     uint64_t DirIdx = 0;
     uint64_t ModTime = 0;
     uint64_t Length = 0;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index 9172df5bfac6..23a573b7a9fa 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -22,8 +22,13 @@ namespace llvm {
 
 class raw_ostream;
 
+struct DWARFAddressRange {
+  uint64_t LowPC;
+  uint64_t HighPC;
+};
+
 /// DWARFAddressRangesVector - represents a set of absolute address ranges.
-typedef std::vector<std::pair<uint64_t, uint64_t>> DWARFAddressRangesVector;
+typedef std::vector<DWARFAddressRange> DWARFAddressRangesVector;
 
 class DWARFDebugRangeList {
 public:
diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
index af01bddeed15..f1e03bb4c2e1 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
@@ -16,7 +16,17 @@
 
 namespace llvm {
 
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t>> RelocAddrMap;
+struct RelocAddrEntry {
+  uint8_t Width;
+  int64_t Value;
+};
+
+// In place of applying the relocations to the data we've read from disk, we
+// keep a separate mapping table on the side and consult it at the locations
+// in the DWARF where we expect relocated values. This adds a bit of
+// complexity to the DWARF parsing/extraction, with the benefit of not
+// allocating memory for the entire size of the debug info sections.
+typedef DenseMap<uint64_t, RelocAddrEntry> RelocAddrMap;
 
 } // end namespace llvm
 
diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 8e12bcd2c8e2..b9f14be85926 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -40,7 +40,7 @@ class DWARFVerifier {
   ///
   /// @param Die          The DWARF DIE that owns the attribute value
   /// @param AttrValue    The DWARF attribute value to check
-  void verifyDebugInfoAttribute(DWARFDie &Die, DWARFAttribute &AttrValue);
+  void verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue);
 
   /// Verifies the attribute's DWARF form.
   ///
@@ -51,7 +51,7 @@ class DWARFVerifier {
   ///
   /// @param Die          The DWARF DIE that owns the attribute value
   /// @param AttrValue    The DWARF attribute value to check
-  void verifyDebugInfoForm(DWARFDie &Die, DWARFAttribute &AttrValue);
+  void verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue);
 
   /// Verifies the all valid references that were found when iterating through
   /// all of the DIE attributes.
@@ -60,7 +60,7 @@ class DWARFVerifier {
   /// offset matches. This helps to ensure if a DWARF link phase moved things
   /// around, that it doesn't create invalid references by failing to relocate
   /// CU relative and absolute references.
-  void veifyDebugInfoReferences();
+  void verifyDebugInfoReferences();
 
   /// Verify the DW_AT_stmt_list encoding and value and ensure that no
   /// compile units have the same DW_AT_stmt_list value.
diff --git a/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
index 979b8454dd5e..771272d6a47d 100644
--- a/include/llvm/DebugInfo/PDB/Native/RawTypes.h
+++ b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
@@ -73,13 +73,6 @@ struct SecMapEntry {
   support::ulittle32_t SecByteLength; // Byte count of the segment or group.
 };
 
-// Used for serialized hash table in TPI stream.
-// In the reference, it is an array of TI and cbOff pair.
-struct TypeIndexOffset {
-  codeview::TypeIndex Type;
-  support::ulittle32_t Offset;
-};
-
 /// Some of the values are stored in bitfields.
Since this needs to be portable /// across compilers and architectures (big / little endian in particular) we /// can't use the actual structures below, but must instead do the shifting diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h index 9fef9bee5e1a..4579cbf4227b 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h @@ -47,7 +47,7 @@ public: uint32_t getHashKeySize() const; uint32_t getNumHashBuckets() const; FixedStreamArray getHashValues() const; - FixedStreamArray getTypeIndexOffsets() const; + FixedStreamArray getTypeIndexOffsets() const; HashTable &getHashAdjusters(); codeview::CVTypeRange types(bool *HadError) const; @@ -62,7 +62,7 @@ private: std::unique_ptr HashStream; FixedStreamArray HashValues; - FixedStreamArray TypeIndexOffsets; + FixedStreamArray TypeIndexOffsets; HashTable HashAdjusters; const TpiStreamHeader *Header; diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h index a29ed0b610d3..6c609c34665c 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h @@ -75,7 +75,7 @@ private: Optional VerHeader; std::vector> TypeRecords; std::vector TypeHashes; - std::vector TypeIndexOffsets; + std::vector TypeIndexOffsets; uint32_t HashStreamIndex = kInvalidStreamIndex; std::unique_ptr HashValueStream; diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 7e7f7358938a..1bb911d09cfb 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -172,6 +172,11 @@ private: return nullptr; } + void removeModulesFromBaseLayer(BaseLayerT &BaseLayer) { + for (auto &BLH : BaseLayerHandles) + BaseLayer.removeModuleSet(BLH); + } + std::unique_ptr ExternalSymbolResolver; std::unique_ptr> MemMgr; std::unique_ptr StubsMgr; @@ -204,6 +209,11 @@ public: CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)), CloneStubsIntoPartitions(CloneStubsIntoPartitions) {} + ~CompileOnDemandLayer() { + while (!LogicalDylibs.empty()) + removeModuleSet(LogicalDylibs.begin()); + } + /// @brief Add a module to the compile-on-demand layer. template @@ -239,6 +249,7 @@ public: /// This will remove all modules in the layers below that were derived from /// the module represented by H. void removeModuleSet(ModuleSetHandleT H) { + H->removeModulesFromBaseLayer(BaseLayer); LogicalDylibs.erase(H); } @@ -478,6 +489,8 @@ private: return 0; } + LD.BaseLayerHandles.push_back(PartH); + return CalledAddr; } diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h index 02f59d6a831a..a19c30631c57 100644 --- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h +++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h @@ -144,16 +144,16 @@ public: void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { - UnfinalizedEHFrames.push_back( - std::make_pair(LoadAddr, static_cast(Size))); + UnfinalizedEHFrames.push_back({LoadAddr, Size}); } - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) override { - auto Err = Client.deregisterEHFrames(LoadAddr, Size); - // FIXME: Add error poll. 
- assert(!Err && "Failed to register remote EH frames."); - (void)Err; + void deregisterEHFrames() override { + for (auto &Frame : RegisteredEHFrames) { + auto Err = Client.deregisterEHFrames(Frame.Addr, Frame.Size); + // FIXME: Add error poll. + assert(!Err && "Failed to register remote EH frames."); + (void)Err; + } } void notifyObjectLoaded(RuntimeDyld &Dyld, @@ -320,7 +320,7 @@ public: Unfinalized.clear(); for (auto &EHFrame : UnfinalizedEHFrames) { - if (auto Err = Client.registerEHFrames(EHFrame.first, EHFrame.second)) { + if (auto Err = Client.registerEHFrames(EHFrame.Addr, EHFrame.Size)) { // FIXME: Replace this once finalizeMemory can return an Error. handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) { if (ErrMsg) { @@ -331,7 +331,8 @@ public: return false; } } - UnfinalizedEHFrames.clear(); + RegisteredEHFrames = std::move(UnfinalizedEHFrames); + UnfinalizedEHFrames = {}; return false; } @@ -387,7 +388,13 @@ public: ResourceIdMgr::ResourceId Id; std::vector Unmapped; std::vector Unfinalized; - std::vector> UnfinalizedEHFrames; + + struct EHFrame { + JITTargetAddress Addr; + uint64_t Size; + }; + std::vector UnfinalizedEHFrames; + std::vector RegisteredEHFrames; }; /// Remote indirect stubs manager. diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index babcc7f26aab..5b3426afe584 100644 --- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -120,6 +120,10 @@ private: buildInitialSymbolTable(PFC->Objects); } + ~ConcreteLinkedObjectSet() override { + MemMgr->deregisterEHFrames(); + } + void setHandle(ObjSetHandleT H) { PFC->Handle = H; } diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h index 5638717790bb..74535fe948ff 100644 --- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h +++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h @@ -69,13 +69,8 @@ public: /// Deregister EH frames in the current proces. static void deregisterEHFramesInProcess(uint8_t *Addr, size_t Size); - void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { - registerEHFramesInProcess(Addr, Size); - } - - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { - deregisterEHFramesInProcess(Addr, Size); - } + void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override; + void deregisterEHFrames() override; /// This method returns the address of the specified function or variable in /// the current process. @@ -139,6 +134,13 @@ public: /// MCJIT or RuntimeDyld. Use getSymbolAddress instead. virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true); + +private: + struct EHFrame { + uint8_t *Addr; + size_t Size; + }; + std::vector EHFrames; }; // Create wrappers for C Binding types (see CBindingWrapping.h). diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h index 13a5f9922c51..9470866dc0d6 100644 --- a/include/llvm/ExecutionEngine/RuntimeDyld.h +++ b/include/llvm/ExecutionEngine/RuntimeDyld.h @@ -150,8 +150,7 @@ public: /// be the case for local execution) these two values will be the same. 
virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) = 0; - virtual void deregisterEHFrames(uint8_t *addr, uint64_t LoadAddr, - size_t Size) = 0; + virtual void deregisterEHFrames() = 0; /// This method is called when object loading is complete and section page /// permissions can be applied. It is up to the memory manager implementation diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index cbe681684a5c..d4a896c01867 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -35,6 +35,7 @@ namespace llvm { class AttrBuilder; class AttributeImpl; class AttributeListImpl; +class AttributeList; class AttributeSetNode; template struct DenseMapInfo; class Function; @@ -227,14 +228,51 @@ public: bool operator==(const AttributeSet &O) { return SetNode == O.SetNode; } bool operator!=(const AttributeSet &O) { return !(*this == O); } + /// Add an argument attribute. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const; + + /// Add a target-dependent attribute. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttribute(LLVMContext &C, StringRef Kind, + StringRef Value = StringRef()) const; + + /// Add attributes to the attribute set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const; + + /// Remove the specified attribute from this set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet removeAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const; + + /// Remove the specified attribute from this set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet removeAttribute(LLVMContext &C, + StringRef Kind) const; + + /// Remove the specified attributes from this set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet removeAttributes(LLVMContext &C, + const AttrBuilder &AttrsToRemove) const; + + /// Return the number of attributes in this set. unsigned getNumAttributes() const; + /// Return true if attributes exist in this set. bool hasAttributes() const { return SetNode != nullptr; } + /// Return true if the attribute exists in this set. bool hasAttribute(Attribute::AttrKind Kind) const; + + /// Return true if the attribute exists in this set. bool hasAttribute(StringRef Kind) const; + /// Return the attribute object. Attribute getAttribute(Attribute::AttrKind Kind) const; + + /// Return the target-dependent attribute object. Attribute getAttribute(StringRef Kind) const; unsigned getAlignment() const; @@ -248,6 +286,9 @@ public: iterator begin() const; iterator end() const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif }; //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 39fb3f1c791b..801e88aba4d1 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -201,6 +201,10 @@ namespace CallingConv { /// shaders) AMDGPU_HS = 93, + /// Calling convention used for special MSP430 rtlib functions + /// which have an "optimized" convention using additional registers. + MSP430_BUILTIN = 94, + /// The highest possible calling convention ID. Must be some 2^k - 1.
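
Illustrative only, not from the patch: because the new AttributeSet helpers are declared const and return a fresh set, callers must assign the result back. A sketch, assuming an existing LLVMContext; the string attribute is invented:

    #include "llvm/IR/Attributes.h"
    using namespace llvm;

    AttributeSet adjust(LLVMContext &Ctx, AttributeSet AS) {
      // Each call leaves the original set untouched and returns a new one.
      AS = AS.addAttribute(Ctx, Attribute::NoUnwind);
      AS = AS.addAttribute(Ctx, "example-attr", "on"); // hypothetical name
      return AS.removeAttribute(Ctx, Attribute::ReadNone);
    }
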
MaxID = 1023 }; diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index ad83b21c7bf3..5db9b3bb5048 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -26,6 +26,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" @@ -452,7 +453,14 @@ class ConstantStruct final : public ConstantAggregate { public: // ConstantStruct accessors static Constant *get(StructType *T, ArrayRef V); - static Constant *get(StructType *T, ...) LLVM_END_WITH_NULL; + + template + static typename std::enable_if::value, + Constant *>::type + get(StructType *T, Csts *... Vs) { + SmallVector Values({Vs...}); + return get(T, Values); + } /// Return an anonymous struct that has the specified elements. /// If the struct is possibly empty, then you must specify a context. diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 0331d5229e7f..358106aac43b 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -16,8 +16,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" @@ -56,10 +59,6 @@ namespace llvm { -class DIBuilder; - -template class Optional; - /// Holds a subclass of DINode. /// /// FIXME: This class doesn't currently make much sense. Previously it was a @@ -94,9 +93,9 @@ public: bool operator!=(const TypedDINodeRef &X) const { return MD != X.MD; } }; -typedef TypedDINodeRef DINodeRef; -typedef TypedDINodeRef DIScopeRef; -typedef TypedDINodeRef DITypeRef; +using DINodeRef = TypedDINodeRef; +using DIScopeRef = TypedDINodeRef; +using DITypeRef = TypedDINodeRef; class DITypeRefArray { const MDTuple *N = nullptr; @@ -240,7 +239,8 @@ public: }; template struct simplify_type> { - typedef Metadata *SimpleType; + using SimpleType = Metadata *; + static SimpleType getSimplifiedValue(const TypedDINodeRef &MD) { return MD; } @@ -799,15 +799,18 @@ public: assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); return DITypeRef(getExtraData()); } + DIObjCProperty *getObjCProperty() const { return dyn_cast_or_null(getExtraData()); } + Constant *getStorageOffsetInBits() const { assert(getTag() == dwarf::DW_TAG_member && isBitField()); if (auto *C = cast_or_null(getExtraData())) return C->getValue(); return nullptr; } + Constant *getConstant() const { assert(getTag() == dwarf::DW_TAG_member && isStaticMember()); if (auto *C = cast_or_null(getExtraData())) @@ -970,9 +973,11 @@ public: #endif replaceOperandWith(4, Elements.get()); } + void replaceVTableHolder(DITypeRef VTableHolder) { replaceOperandWith(5, VTableHolder); } + void replaceTemplateParams(DITemplateParameterArray TemplateParams) { replaceOperandWith(6, TemplateParams.get()); } @@ -1031,6 +1036,7 @@ public: DITypeRefArray getTypeArray() const { return cast_or_null(getRawTypeArray()); } + Metadata *getRawTypeArray() const { return getOperand(3); } static bool classof(const Metadata *MD) { @@ -1319,6 +1325,7 @@ public: unsigned getLine() const { return SubclassData32; } unsigned getColumn() const { return SubclassData16; } DILocalScope *getScope() const { return cast(getRawScope()); } + DILocation *getInlinedAt() const 
{ return cast_or_null(getRawInlinedAt()); } @@ -1452,7 +1459,6 @@ public: static bool classof(const Metadata *MD) { return MD->getMetadataID() == DILocationKind; } - }; /// Subprogram description. @@ -2087,6 +2093,7 @@ public: return F->getFilename(); return ""; } + StringRef getDirectory() const { if (auto *F = getFile()) return F->getDirectory(); @@ -2143,6 +2150,7 @@ public: ArrayRef getElements() const { return Elements; } unsigned getNumElements() const { return Elements.size(); } + uint64_t getElement(unsigned I) const { assert(I < Elements.size() && "Index out of range"); return Elements[I]; @@ -2151,7 +2159,8 @@ public: /// Determine whether this represents a standalone constant value. bool isConstant() const; - typedef ArrayRef::iterator element_iterator; + using element_iterator = ArrayRef::iterator; + element_iterator elements_begin() const { return getElements().begin(); } element_iterator elements_end() const { return getElements().end(); } @@ -2276,6 +2285,10 @@ public: /// Append \p Ops with operations to apply the \p Offset. static void appendOffset(SmallVectorImpl &Ops, int64_t Offset); + /// If this is a constant offset, extract it. If there is no expression, + /// return true with an offset of zero. + bool extractIfOffset(int64_t &Offset) const; + /// Constants for DIExpression::prepend. enum { NoDeref = false, WithDeref = true, WithStackValue = true }; @@ -2509,6 +2522,7 @@ public: return F->getFilename(); return ""; } + StringRef getDirectory() const { if (auto *F = getFile()) return F->getDirectory(); @@ -2609,10 +2623,13 @@ public: TempDIGlobalVariableExpression clone() const { return cloneImpl(); } Metadata *getRawVariable() const { return getOperand(0); } + DIGlobalVariable *getVariable() const { return cast_or_null(getRawVariable()); } + Metadata *getRawExpression() const { return getOperand(1); } + DIExpression *getExpression() const { return cast_or_null(getRawExpression()); } diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h index 202be3da14da..aa74f361cda2 100644 --- a/include/llvm/IR/DebugLoc.h +++ b/include/llvm/IR/DebugLoc.h @@ -80,6 +80,22 @@ namespace llvm { static DebugLoc get(unsigned Line, unsigned Col, const MDNode *Scope, const MDNode *InlinedAt = nullptr); + enum { ReplaceLastInlinedAt = true }; + /// Rebuild the entire inlined-at chain for this instruction so that the top of + /// the chain now is inlined-at the new call site. + /// \param InlinedAt The new outermost inlined-at in the chain. + /// \param ReplaceLast Replace the last location in the inlined-at chain. + static DebugLoc appendInlinedAt(DebugLoc DL, DILocation *InlinedAt, + LLVMContext &Ctx, + DenseMap &Cache, + bool ReplaceLast = false); + + /// Reparent all debug locations referenced by \c I that belong to \c OrigSP + /// to become (possibly indirect) children of \c NewSP. 
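
A sketch (not part of the patch) of the documented extractIfOffset contract: it reports success with an offset of zero when there is no expression, and extracts the constant when the expression is a plain offset. The wrapper name and the null-pointer handling are my assumptions:

    #include "llvm/IR/DebugInfoMetadata.h"
    #include <cstdint>
    using namespace llvm;

    // Returns true when Expr is absent or a pure constant offset.
    bool getConstantOffset(const DIExpression *Expr, int64_t &Offset) {
      if (!Expr) {
        Offset = 0; // "no expression" counts as offset zero
        return true;
      }
      return Expr->extractIfOffset(Offset);
    }
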
+ static void reparentDebugInfo(Instruction &I, DISubprogram *OrigSP, + DISubprogram *NewSP, + DenseMap &Cache); + unsigned getLine() const; unsigned getCol() const; MDNode *getScope() const; diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h index 05e99157b8dc..a92321a44511 100644 --- a/include/llvm/IR/DerivedTypes.h +++ b/include/llvm/IR/DerivedTypes.h @@ -1,4 +1,4 @@ -//===-- llvm/DerivedTypes.h - Classes for handling data types ---*- C++ -*-===// +//===- llvm/DerivedTypes.h - Classes for handling data types ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -19,6 +19,7 @@ #define LLVM_IR_DERIVEDTYPES_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" @@ -122,7 +123,8 @@ public: bool isVarArg() const { return getSubclassData()!=0; } Type *getReturnType() const { return ContainedTys[0]; } - typedef Type::subtype_iterator param_iterator; + using param_iterator = Type::subtype_iterator; + param_iterator param_begin() const { return ContainedTys + 1; } param_iterator param_end() const { return &ContainedTys[NumContainedTys]; } ArrayRef params() const { @@ -197,8 +199,7 @@ public: /// generator for a target expects). /// class StructType : public CompositeType { - StructType(LLVMContext &C) - : CompositeType(C, StructTyID), SymbolTableEntry(nullptr) {} + StructType(LLVMContext &C) : CompositeType(C, StructTyID) {} enum { /// This is the contents of the SubClassData field. @@ -212,7 +213,7 @@ class StructType : public CompositeType { /// symbol table entry (maintained by LLVMContext) for the struct. /// This is null if the type is a literal struct or if it is an identified /// type that has an empty name. - void *SymbolTableEntry; + void *SymbolTableEntry = nullptr; public: StructType(const StructType &) = delete; @@ -228,7 +229,14 @@ public: static StructType *create(LLVMContext &Context, ArrayRef Elements, StringRef Name, bool isPacked = false); static StructType *create(LLVMContext &Context, ArrayRef Elements); - static StructType *create(StringRef Name, Type *elt1, ...) LLVM_END_WITH_NULL; + template + static typename std::enable_if::value, + StructType *>::type + create(StringRef Name, Type *elt1, Tys *... elts) { + assert(elt1 && "Cannot create a struct type with no elements with this"); + SmallVector StructFields({elt1, elts...}); + return create(StructFields, Name); + } /// This static method is the primary way to create a literal StructType. static StructType *get(LLVMContext &Context, ArrayRef Elements, @@ -240,7 +248,15 @@ public: /// This static method is a convenience method for creating structure types by /// specifying the elements as arguments. Note that this method always returns /// a non-packed struct, and requires at least one element type. - static StructType *get(Type *elt1, ...) LLVM_END_WITH_NULL; + template + static typename std::enable_if::value, + StructType *>::type + get(Type *elt1, Tys *... elts) { + assert(elt1 && "Cannot create a struct type with no elements with this"); + LLVMContext &Ctx = elt1->getContext(); + SmallVector StructFields({elt1, elts...}); + return llvm::StructType::get(Ctx, StructFields); + } bool isPacked() const { return (getSubclassData() & SCDB_Packed) != 0; } @@ -269,13 +285,21 @@ public: /// Specify a body for an opaque identified type. void setBody(ArrayRef Elements, bool isPacked = false); - void setBody(Type *elt1, ...)
LLVM_END_WITH_NULL; + + template + typename std::enable_if::value, void>::type + setBody(Type *elt1, Tys *... elts) { + assert(elt1 && "Cannot create a struct type with no elements with this"); + SmallVector StructFields({elt1, elts...}); + setBody(StructFields); + } /// Return true if the specified type is valid as an element type. static bool isValidElementType(Type *ElemTy); // Iterator access to the elements. - typedef Type::subtype_iterator element_iterator; + using element_iterator = Type::subtype_iterator; + element_iterator element_begin() const { return ContainedTys; } element_iterator element_end() const { return &ContainedTys[NumContainedTys];} ArrayRef const elements() const { diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h index 458c3cf29b0d..5497652135bd 100644 --- a/include/llvm/IR/DiagnosticInfo.h +++ b/include/llvm/IR/DiagnosticInfo.h @@ -15,7 +15,6 @@ #ifndef LLVM_IR_DIAGNOSTICINFO_H #define LLVM_IR_DIAGNOSTICINFO_H -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -120,18 +119,18 @@ public: virtual void print(DiagnosticPrinter &DP) const = 0; }; -typedef std::function DiagnosticHandlerFunction; +using DiagnosticHandlerFunction = std::function; /// Diagnostic information for inline asm reporting. /// This is basically a message and an optional location. class DiagnosticInfoInlineAsm : public DiagnosticInfo { private: /// Optional line information. 0 if not set. - unsigned LocCookie; + unsigned LocCookie = 0; /// Message to be reported. const Twine &MsgStr; /// Optional origin of the problem. - const Instruction *Instr; + const Instruction *Instr = nullptr; public: /// \p MsgStr is the message to be reported to the frontend. /// for the whole lifetime of the Diagnostic. DiagnosticInfoInlineAsm(const Twine &MsgStr, DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr), - Instr(nullptr) {} + : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr) {} /// \p LocCookie if non-zero gives the line number for this report. /// \p MsgStr gives the message. /// DiagnosticInfoInlineAsm(unsigned LocCookie, const Twine &MsgStr, DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(LocCookie), - MsgStr(MsgStr), Instr(nullptr) {} + MsgStr(MsgStr) {} /// \p Instr gives the original instruction that triggered the diagnostic. /// \p MsgStr gives the message. @@ -294,10 +292,10 @@ public: DiagnosticInfoSampleProfile(StringRef FileName, const Twine &Msg, DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_SampleProfile, Severity), FileName(FileName), - LineNum(0), Msg(Msg) {} + Msg(Msg) {} DiagnosticInfoSampleProfile(const Twine &Msg, DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(DK_SampleProfile, Severity), LineNum(0), Msg(Msg) {} + : DiagnosticInfo(DK_SampleProfile, Severity), Msg(Msg) {} /// \see DiagnosticInfo::print. void print(DiagnosticPrinter &DP) const override; @@ -316,7 +314,7 @@ private: /// Line number where the diagnostic occurred. If 0, no line number will /// be emitted in the message. - unsigned LineNum; + unsigned LineNum = 0; /// Message to report.
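
Not part of the patch: a minimal sketch of the variadic create/setBody replacements for the old LLVM_END_WITH_NULL varargs, shown on the usual self-referential struct; the type name is arbitrary:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    StructType *makeListNode(LLVMContext &Ctx) {
      // Create the opaque identified type first so it can refer to itself,
      StructType *Node = StructType::create(Ctx, "list_node");
      // then fill in the body; no trailing sentinel nullptr is needed now.
      Node->setBody(Type::getInt32Ty(Ctx), PointerType::getUnqual(Node));
      return Node;
    }
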
const Twine &Msg; @@ -351,8 +349,9 @@ class DiagnosticLocation { StringRef Filename; unsigned Line = 0; unsigned Column = 0; + public: - DiagnosticLocation() {} + DiagnosticLocation() = default; DiagnosticLocation(const DebugLoc &DL); DiagnosticLocation(const DISubprogram *SP); @@ -796,6 +795,7 @@ private: const Twine &Msg) : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisFPCommute, PassName, Fn, Loc, Msg) {} + friend void emitOptimizationRemarkAnalysisFPCommute( LLVMContext &Ctx, const char *PassName, const Function &Fn, const DiagnosticLocation &Loc, const Twine &Msg); @@ -1012,6 +1012,7 @@ public: void print(DiagnosticPrinter &DP) const override; }; + } // end namespace llvm #endif // LLVM_IR_DIAGNOSTICINFO_H diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index c12a125b6352..8a2a6ed87eb2 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -1,4 +1,4 @@ -//===-- llvm/Function.h - Class to represent a single function --*- C++ -*-===// +//===- llvm/Function.h - Class to represent a single function ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -22,15 +22,19 @@ #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/SymbolTableListTraits.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include #include @@ -40,27 +44,31 @@ namespace llvm { -template class Optional; class AssemblyAnnotationWriter; -class FunctionType; -class LLVMContext; +class Constant; class DISubprogram; +class LLVMContext; +class Module; +template class Optional; +class raw_ostream; +class Type; +class User; class Function : public GlobalObject, public ilist_node { public: - typedef SymbolTableList BasicBlockListType; + using BasicBlockListType = SymbolTableList; // BasicBlock iterators... - typedef BasicBlockListType::iterator iterator; - typedef BasicBlockListType::const_iterator const_iterator; + using iterator = BasicBlockListType::iterator; + using const_iterator = BasicBlockListType::const_iterator; - typedef Argument *arg_iterator; - typedef const Argument *const_arg_iterator; + using arg_iterator = Argument *; + using const_arg_iterator = const Argument *; private: // Important things that make up a function! - BasicBlockListType BasicBlocks; ///< The basic blocks - mutable Argument *Arguments; ///< The formal arguments + BasicBlockListType BasicBlocks; ///< The basic blocks + mutable Argument *Arguments = nullptr; ///< The formal arguments size_t NumArgs; std::unique_ptr SymTab; ///< Symbol table of args/instructions @@ -124,10 +132,12 @@ public: // Provide fast operand accessors. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + /// Returns the FunctionType for me. FunctionType *getFunctionType() const { return cast(getValueType()); } + /// Returns the type of the ret val. Type *getReturnType() const { return getFunctionType()->getReturnType(); } @@ -484,7 +494,7 @@ public: /// copyAttributesFrom - copy all additional attributes (those not needed to /// create a Function) from the Function Src to this one. 
- void copyAttributesFrom(const GlobalValue *Src) override; + void copyAttributesFrom(const Function *Src); /// deleteBody - This method deletes the body of the function, and converts /// the linkage to external. @@ -497,12 +507,12 @@ public: /// removeFromParent - This method unlinks 'this' from the containing module, /// but does not delete it. /// - void removeFromParent() override; + void removeFromParent(); /// eraseFromParent - This method unlinks 'this' from the containing module /// and deletes it. /// - void eraseFromParent() override; + void eraseFromParent(); /// Steal arguments from another function. /// diff --git a/include/llvm/IR/GetElementPtrTypeIterator.h b/include/llvm/IR/GetElementPtrTypeIterator.h index 490bff29cf38..f017a449d33f 100644 --- a/include/llvm/IR/GetElementPtrTypeIterator.h +++ b/include/llvm/IR/GetElementPtrTypeIterator.h @@ -21,7 +21,9 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/User.h" #include "llvm/Support/Casting.h" +#include #include +#include #include namespace llvm { @@ -29,13 +31,13 @@ namespace llvm { template class generic_gep_type_iterator : public std::iterator { - typedef std::iterator super; + using super = std::iterator; ItTy OpIt; PointerUnion CurTy; enum : uint64_t { Unbounded = -1ull }; uint64_t NumElements = Unbounded; + generic_gep_type_iterator() = default; public: @@ -121,7 +123,7 @@ namespace llvm { } }; - typedef generic_gep_type_iterator<> gep_type_iterator; + using gep_type_iterator = generic_gep_type_iterator<>; inline gep_type_iterator gep_type_begin(const User *GEP) { auto *GEPOp = cast(GEP); diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h index 37a291dfeb7a..d4bf0d7e1ed4 100644 --- a/include/llvm/IR/GlobalAlias.h +++ b/include/llvm/IR/GlobalAlias.h @@ -59,15 +59,19 @@ public: // Linkage, Type, Parent and AddressSpace taken from the Aliasee. static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee); + void copyAttributesFrom(const GlobalValue *Src) { + GlobalValue::copyAttributesFrom(Src); + } + /// removeFromParent - This method unlinks 'this' from the containing module, /// but does not delete it. /// - void removeFromParent() override; + void removeFromParent(); /// eraseFromParent - This method unlinks 'this' from the containing module /// and deletes it. /// - void eraseFromParent() override; + void eraseFromParent(); /// These methods retrieve and set alias target. void setAliasee(Constant *Aliasee); diff --git a/include/llvm/IR/GlobalIFunc.h b/include/llvm/IR/GlobalIFunc.h index bfaa9960cb13..d90c7c78ed26 100644 --- a/include/llvm/IR/GlobalIFunc.h +++ b/include/llvm/IR/GlobalIFunc.h @@ -47,12 +47,16 @@ public: LinkageTypes Linkage, const Twine &Name, Constant *Resolver, Module *Parent); + void copyAttributesFrom(const GlobalIFunc *Src) { + GlobalValue::copyAttributesFrom(Src); + } + /// This method unlinks 'this' from the containing module, but does not /// delete it. - void removeFromParent() final; + void removeFromParent(); /// This method unlinks 'this' from the containing module and deletes it. - void eraseFromParent() final; + void eraseFromParent(); /// These methods retrieve and set ifunc resolver function. 
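
For illustration (not from the patch): copyAttributesFrom is now dispatched statically on the concrete class, so typical cloning code passes the exact type. A hedged sketch; the helper is hypothetical:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Create a declaration in M that mirrors F's signature and attributes.
    Function *cloneDeclaration(Function &F, Module &M, const Twine &Name) {
      Function *NewF =
          Function::Create(F.getFunctionType(), F.getLinkage(), Name, &M);
      NewF->copyAttributesFrom(&F); // Function overload, no virtual call
      return NewF;
    }
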
void setResolver(Constant *Resolver) { diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h index f3789bafefe3..fc38f698027b 100644 --- a/include/llvm/IR/GlobalObject.h +++ b/include/llvm/IR/GlobalObject.h @@ -150,8 +150,10 @@ public: void addTypeMetadata(unsigned Offset, Metadata *TypeID); - void copyAttributesFrom(const GlobalValue *Src) override; +protected: + void copyAttributesFrom(const GlobalObject *Src); +public: // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Value *V) { return V->getValueID() == Value::FunctionVal || diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index bb30fa8be867..0793a1c0ee2e 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -435,14 +435,20 @@ public: bool isWeakForLinker() const { return isWeakForLinker(getLinkage()); } +protected: /// Copy all additional attributes (those not needed to create a GlobalValue) /// from the GlobalValue Src to this one. - virtual void copyAttributesFrom(const GlobalValue *Src); + void copyAttributesFrom(const GlobalValue *Src); - /// If special LLVM prefix that is used to inform the asm printer to not emit - /// usual symbol prefix before the symbol name is used then return linkage - /// name after skipping this special LLVM prefix. - static StringRef getRealLinkageName(StringRef Name) { +public: + /// If the given string begins with the GlobalValue name mangling escape + /// character '\1', drop it. + /// + /// This function applies a specific mangling that is used in PGO profiles, + /// among other things. If you're trying to get a symbol name for an + /// arbitrary GlobalValue, this is not the function you're looking for; see + /// Mangler.h. + static StringRef dropLLVMManglingEscape(StringRef Name) { if (!Name.empty() && Name[0] == '\1') return Name.substr(1); return Name; @@ -530,10 +536,10 @@ public: /// This method unlinks 'this' from the containing module, but does not delete /// it. - virtual void removeFromParent() = 0; + void removeFromParent(); /// This method unlinks 'this' from the containing module and deletes it. - virtual void eraseFromParent() = 0; + void eraseFromParent(); /// Get the module that this global value is contained inside of... Module *getParent() { return Parent; } diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h index 3b545d811d44..21d334c8f01d 100644 --- a/include/llvm/IR/GlobalVariable.h +++ b/include/llvm/IR/GlobalVariable.h @@ -24,6 +24,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist_node.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Value.h" #include @@ -41,6 +42,7 @@ class DIGlobalVariableExpression; class GlobalVariable : public GlobalObject, public ilist_node { friend class SymbolTableListTraits; + AttributeSet Attrs; bool isConstantGlobal : 1; // Is this a global constant? bool isExternallyInitializedConstant : 1; // Is this a global whose value // can change from its initial @@ -156,17 +158,17 @@ public: /// copyAttributesFrom - copy all additional attributes (those not needed to /// create a GlobalVariable) from the GlobalVariable Src to this one. - void copyAttributesFrom(const GlobalValue *Src) override; + void copyAttributesFrom(const GlobalVariable *Src); /// removeFromParent - This method unlinks 'this' from the containing module, /// but does not delete it. 
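
Illustrative only: dropLLVMManglingEscape strips just the leading '\1' escape byte and leaves every other name alone, e.g.:

    #include "llvm/IR/GlobalValue.h"
    #include <cassert>
    using namespace llvm;

    void demoManglingEscape() {
      // '\1' tells the asm printer not to apply the usual symbol prefix;
      // the helper only drops that escape character.
      assert(GlobalValue::dropLLVMManglingEscape("\1raw") == "raw");
      assert(GlobalValue::dropLLVMManglingEscape("plain") == "plain");
    }
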
/// - void removeFromParent() override; + void removeFromParent(); /// eraseFromParent - This method unlinks 'this' from the containing module /// and deletes it. /// - void eraseFromParent() override; + void eraseFromParent(); /// Drop all references in preparation to destroy the GlobalVariable. This /// drops not only the reference to the initializer but also to any metadata. @@ -178,6 +180,61 @@ public: /// Fill the vector with all debug info attachments. void getDebugInfo(SmallVectorImpl &GVs) const; + /// Add attribute to this global. + void addAttribute(Attribute::AttrKind Kind) { + Attrs = Attrs.addAttribute(getContext(), Kind); + } + + /// Add attribute to this global. + void addAttribute(StringRef Kind, StringRef Val = StringRef()) { + Attrs = Attrs.addAttribute(getContext(), Kind, Val); + } + + /// Return true if the attribute exists. + bool hasAttribute(Attribute::AttrKind Kind) const { + return Attrs.hasAttribute(Kind); + } + + /// Return true if the attribute exists. + bool hasAttribute(StringRef Kind) const { + return Attrs.hasAttribute(Kind); + } + + /// Return true if any attributes exist. + bool hasAttributes() const { + return Attrs.hasAttributes(); + } + + /// Return the attribute object. + Attribute getAttribute(Attribute::AttrKind Kind) const { + return Attrs.getAttribute(Kind); + } + + /// Return the attribute object. + Attribute getAttribute(StringRef Kind) const { + return Attrs.getAttribute(Kind); + } + + /// Return the attribute set for this global. + AttributeSet getAttributes() const { + return Attrs; + } + + /// Return the attribute set as an AttributeList with the given index. + /// FIXME: This may not be required once ValueEnumerators + /// in bitcode-writer can enumerate attribute-set. + AttributeList getAttributesAsList(unsigned index) const { + if (!hasAttributes()) + return AttributeList(); + std::pair<unsigned, AttributeSet> AS[1] = {{index, Attrs}}; + return AttributeList::get(getContext(), AS); + } + + /// Set attribute list for this global. + void setAttributes(AttributeSet A) { + Attrs = A; + } + // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Value *V) { return V->getValueID() == Value::GlobalVariableVal; diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index bc689f3b01d7..9d4c13c29f68 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -454,6 +454,45 @@ public: MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); + /// \brief Create a vector fadd reduction intrinsic of the source vector. + /// The first parameter is a scalar accumulator value for ordered reductions. + CallInst *CreateFAddReduce(Value *Acc, Value *Src); + + /// \brief Create a vector fmul reduction intrinsic of the source vector. + /// The first parameter is a scalar accumulator value for ordered reductions. + CallInst *CreateFMulReduce(Value *Acc, Value *Src); + + /// \brief Create a vector int add reduction intrinsic of the source vector. + CallInst *CreateAddReduce(Value *Src); + + /// \brief Create a vector int mul reduction intrinsic of the source vector. + CallInst *CreateMulReduce(Value *Src); + + /// \brief Create a vector int AND reduction intrinsic of the source vector. + CallInst *CreateAndReduce(Value *Src); + + /// \brief Create a vector int OR reduction intrinsic of the source vector. + CallInst *CreateOrReduce(Value *Src); + + /// \brief Create a vector int XOR reduction intrinsic of the source vector.
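
A sketch, not part of the patch, of the new GlobalVariable attribute accessors; the attribute name here is invented for illustration:

    #include "llvm/IR/GlobalVariable.h"
    using namespace llvm;

    void tagGlobal(GlobalVariable &GV) {
      // Attributes accumulate on the immutable AttributeSet the global owns.
      GV.addAttribute("example-kind", "hot");
      if (GV.hasAttribute("example-kind"))
        (void)GV.getAttribute("example-kind").getValueAsString();
    }
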
+ CallInst *CreateXorReduce(Value *Src); + + /// \brief Create a vector integer max reduction intrinsic of the source + /// vector. + CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false); + + /// \brief Create a vector integer min reduction intrinsic of the source + /// vector. + CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false); + + /// \brief Create a vector float max reduction intrinsic of the source + /// vector. + CallInst *CreateFPMaxReduce(Value *Src, bool NoNaN = false); + + /// \brief Create a vector float min reduction intrinsic of the source + /// vector. + CallInst *CreateFPMinReduce(Value *Src, bool NoNaN = false); + /// \brief Create a lifetime.start intrinsic. /// /// If the pointer isn't i8* it will be converted. diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index d16a5d318d78..61ca90de7393 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -65,27 +65,15 @@ protected: // Out of line virtual method, so the vtable, etc has a home. ~TerminatorInst() override; - /// Virtual methods - Terminators should overload these and provide inline - /// overrides of non-V methods. - virtual BasicBlock *getSuccessorV(unsigned idx) const = 0; - virtual unsigned getNumSuccessorsV() const = 0; - virtual void setSuccessorV(unsigned idx, BasicBlock *B) = 0; - public: /// Return the number of successors that this terminator has. - unsigned getNumSuccessors() const { - return getNumSuccessorsV(); - } + unsigned getNumSuccessors() const; /// Return the specified successor. - BasicBlock *getSuccessor(unsigned idx) const { - return getSuccessorV(idx); - } + BasicBlock *getSuccessor(unsigned idx) const; /// Update the specified successor to point at the provided block. - void setSuccessor(unsigned idx, BasicBlock *B) { - setSuccessorV(idx, B); - } + void setSuccessor(unsigned idx, BasicBlock *B); // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h index 90c3175122fd..fca29900f4c2 100644 --- a/include/llvm/IR/Instruction.h +++ b/include/llvm/IR/Instruction.h @@ -456,6 +456,12 @@ public: /// higher. bool isAtomic() const; + /// Return true if this atomic instruction loads from memory. + bool hasAtomicLoad() const; + + /// Return true if this atomic instruction stores to memory. + bool hasAtomicStore() const; + /// Return true if this instruction may throw an exception. 
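
Not in the patch: a minimal sketch of the new reduction helpers; Vec is assumed to be a value of some integer vector type:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    Value *sumPlusMax(IRBuilder<> &B, Value *Vec) {
      // Each helper emits a call to the matching
      // llvm.experimental.vector.reduce.* intrinsic.
      Value *Sum = B.CreateAddReduce(Vec);
      Value *Max = B.CreateIntMaxReduce(Vec, /*IsSigned=*/true);
      return B.CreateAdd(Sum, Max);
    }
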
bool mayThrow() const; diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 844a7273eca9..c26701af27ce 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -1,4 +1,4 @@ -//===-- llvm/Instructions.h - Instruction subclass definitions --*- C++ -*-===// +//===- llvm/Instructions.h - Instruction subclass definitions ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ #define LLVM_IR_INSTRUCTIONS_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" @@ -24,21 +25,25 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include #include #include +#include namespace llvm { @@ -264,6 +269,7 @@ public: } bool isSimple() const { return !isAtomic() && !isVolatile(); } + bool isUnordered() const { return (getOrdering() == AtomicOrdering::NotAtomic || getOrdering() == AtomicOrdering::Unordered) && @@ -386,6 +392,7 @@ public: } bool isSimple() const { return !isAtomic() && !isVolatile(); } + bool isUnordered() const { return (getOrdering() == AtomicOrdering::NotAtomic || getOrdering() == AtomicOrdering::Unordered) && @@ -836,10 +843,7 @@ class GetElementPtrInst : public Instruction { Type *SourceElementType; Type *ResultElementType; - void anchor() override; - GetElementPtrInst(const GetElementPtrInst &GEPI); - void init(Value *Ptr, ArrayRef IdxList, const Twine &NameStr); /// Constructors - Create a getelementptr instruction with a base pointer and /// a list of indices. The first ctor can optionally insert before an existing @@ -852,6 +856,9 @@ class GetElementPtrInst : public Instruction { ArrayRef IdxList, unsigned Values, const Twine &NameStr, BasicBlock *InsertAtEnd); + void anchor() override; + void init(Value *Ptr, ArrayRef IdxList, const Twine &NameStr); + protected: // Note: Instruction needs to be a friend here to call cloneImpl. friend class Instruction; @@ -2261,6 +2268,19 @@ public: return Mask; } + /// Change values in a shuffle permute mask assuming the two vector operands + /// of length InVecNumElts have swapped position. + static void commuteShuffleMask(MutableArrayRef Mask, + unsigned InVecNumElts) { + for (int &Idx : Mask) { + if (Idx == -1) + continue; + Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts; + assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 && + "shufflevector mask index out of range"); + } + } + // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { return I->getOpcode() == Instruction::ShuffleVector; @@ -2288,6 +2308,7 @@ class ExtractValueInst : public UnaryInstruction { SmallVector Indices; ExtractValueInst(const ExtractValueInst &EVI); + /// Constructors - Create an extractvalue instruction with a base aggregate /// value and a list of indices.
The first ctor can optionally insert before /// an existing instruction, the second appends the new instruction to the @@ -2333,7 +2354,8 @@ public: /// Null is returned if the indices are invalid for the specified type. static Type *getIndexedType(Type *Agg, ArrayRef Idxs); - typedef const unsigned* idx_iterator; + using idx_iterator = const unsigned*; + inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { @@ -2455,7 +2477,8 @@ public: /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); - typedef const unsigned* idx_iterator; + using idx_iterator = const unsigned*; + inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { @@ -2606,8 +2629,8 @@ public: // Block iterator interface. This provides access to the list of incoming // basic blocks, which parallels the list of incoming values. - typedef BasicBlock **block_iterator; - typedef BasicBlock * const *const_block_iterator; + using block_iterator = BasicBlock **; + using const_block_iterator = BasicBlock * const *; block_iterator block_begin() { Use::UserRef *ref = @@ -2656,9 +2679,11 @@ public: "All operands to PHI node must be the same type as the PHI node!"); setOperand(i, V); } + static unsigned getOperandNumForIncomingValue(unsigned i) { return i; } + static unsigned getIncomingValueNumForOperand(unsigned i) { return i; } @@ -2937,9 +2962,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3047,9 +3074,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3123,7 +3152,7 @@ public: protected: // Expose the switch type we're parameterized with to the iterator. 
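
A worked example (not from the patch) of commuteShuffleMask: with two 4-element operands swapped, indices cross the 4-element boundary and undef (-1) entries stay put:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    void demoCommute() {
      // Mask selecting {a[0], b[1], a[2], undef} from 4-wide vectors a, b.
      SmallVector<int, 4> Mask = {0, 5, 2, -1};
      ShuffleVectorInst::commuteShuffleMask(Mask, /*InVecNumElts=*/4);
      // With the operands swapped the mask is now {4, 1, 6, -1}.
    }
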
- typedef SwitchInstT SwitchInstType; + using SwitchInstType = SwitchInstT; SwitchInstT *SI; ptrdiff_t Index; @@ -3164,8 +3193,8 @@ public: } }; - typedef CaseHandleImpl - ConstCaseHandle; + using ConstCaseHandle = + CaseHandleImpl; class CaseHandle : public CaseHandleImpl { @@ -3192,7 +3221,7 @@ public: : public iterator_facade_base, std::random_access_iterator_tag, CaseHandleT> { - typedef typename CaseHandleT::SwitchInstType SwitchInstT; + using SwitchInstT = typename CaseHandleT::SwitchInstType; CaseHandleT Case; @@ -3254,8 +3283,8 @@ public: const CaseHandleT &operator*() const { return Case; } }; - typedef CaseIteratorImpl CaseIt; - typedef CaseIteratorImpl ConstCaseIt; + using CaseIt = CaseIteratorImpl; + using ConstCaseIt = CaseIteratorImpl; static SwitchInst *Create(Value *Value, BasicBlock *Default, unsigned NumCases, @@ -3411,9 +3440,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3516,9 +3547,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3639,6 +3672,7 @@ public: return new (Values) InvokeInst(Func, IfNormal, IfException, Args, None, Values, NameStr, InsertAtEnd); } + static InvokeInst *Create(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, @@ -3996,9 +4030,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); template bool hasFnAttrImpl(AttrKind Kind) const { if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind)) @@ -4095,9 +4131,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -4202,13 +4240,14 @@ private: } public: - typedef std::pointer_to_unary_function DerefFnTy; - typedef mapped_iterator handler_iterator; - typedef iterator_range handler_range; - typedef std::pointer_to_unary_function - ConstDerefFnTy; - typedef mapped_iterator const_handler_iterator; - typedef iterator_range const_handler_range; + using DerefFnTy = std::pointer_to_unary_function; + using handler_iterator = mapped_iterator; + using handler_range = iterator_range; + using ConstDerefFnTy = + std::pointer_to_unary_function; + using const_handler_iterator = + mapped_iterator; + using const_handler_range = iterator_range; /// Returns an iterator that points to the first handler in CatchSwitchInst. 
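
For illustration only: the case-handle machinery above is what SwitchInst::cases() hands out, so client code typically looks like this hypothetical helper:

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Count how many cases of SI branch to Target.
    unsigned countCasesTo(const SwitchInst &SI, const BasicBlock *Target) {
      unsigned N = 0;
      for (const auto &Case : SI.cases())
        if (Case.getCaseSuccessor() == Target)
          ++N;
      return N;
    }
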
handler_iterator handler_begin() { @@ -4278,9 +4317,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned Idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned Idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned Idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned Idx, BasicBlock *B); }; template <> @@ -4443,9 +4484,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned Idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned Idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned Idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned Idx, BasicBlock *B); }; template <> @@ -4531,9 +4574,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned Idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned Idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned Idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned Idx, BasicBlock *B); // Shadow Instruction::setInstructionSubclassData with a private forwarding // method so that subclasses cannot accidentally use it. @@ -4586,9 +4631,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 7b78d4d3d34a..19f6045568f4 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -812,6 +812,50 @@ def int_memcpy_element_atomic : Intrinsic<[], [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>]>; +//===------------------------ Reduction Intrinsics ------------------------===// +// +def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyfloat_ty, + llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyfloat_ty, + llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_mul : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_and : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_or : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_xor : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_smax : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_smin : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_umax : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_umin : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_fmax : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def 
int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===----- Intrinsics that are used to provide predicate information -----===// def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index d13d5ddaeb3c..ad011fb72e6a 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -1,4 +1,4 @@ -//===-- llvm/LLVMContext.h - Class for managing "global" state --*- C++ -*-===// +//===- llvm/LLVMContext.h - Class for managing "global" state ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -37,7 +37,9 @@ class StringRef; class Twine; namespace yaml { + class Output; + } // end namespace yaml /// This is an important class for using LLVM in a threaded context. It @@ -134,17 +136,17 @@ public: void enableDebugTypeODRUniquing(); void disableDebugTypeODRUniquing(); - typedef void (*InlineAsmDiagHandlerTy)(const SMDiagnostic&, void *Context, - unsigned LocCookie); + using InlineAsmDiagHandlerTy = void (*)(const SMDiagnostic&, void *Context, + unsigned LocCookie); /// Defines the type of a diagnostic handler. /// \see LLVMContext::setDiagnosticHandler. /// \see LLVMContext::diagnose. - typedef void (*DiagnosticHandlerTy)(const DiagnosticInfo &DI, void *Context); + using DiagnosticHandlerTy = void (*)(const DiagnosticInfo &DI, void *Context); /// Defines the type of a yield callback. /// \see LLVMContext::setYieldCallback. - typedef void (*YieldCallbackTy)(LLVMContext *Context, void *OpaqueHandle); + using YieldCallbackTy = void (*)(LLVMContext *Context, void *OpaqueHandle); /// setInlineAsmDiagnosticHandler - This method sets a handler that is invoked /// when problems with inline asm are detected by the backend. The first diff --git a/include/llvm/IR/LegacyPassManager.h b/include/llvm/IR/LegacyPassManager.h index 5257a0eed488..9a376a151505 100644 --- a/include/llvm/IR/LegacyPassManager.h +++ b/include/llvm/IR/LegacyPassManager.h @@ -98,6 +98,9 @@ private: // Create wrappers for C Binding types (see CBindingWrapping.h). DEFINE_STDCXX_CONVERSION_FUNCTIONS(legacy::PassManagerBase, LLVMPassManagerRef) +/// If -time-passes has been specified, report the timings immediately and then +/// reset the timers to zero. 
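
A sketch (not in the patch) of the intended use of reportAndResetTimings between pipeline runs; the pass chosen is arbitrary:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Scalar.h"
    using namespace llvm;

    void runOnce(Module &M) {
      legacy::PassManager PM;
      PM.add(createCFGSimplificationPass()); // any pass; illustrative only
      PM.run(M);
      // Under -time-passes, print the timings accumulated so far and zero
      // the timers so the next run reports only its own time.
      reportAndResetTimings();
    }
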
+void reportAndResetTimings(); } // End llvm namespace #endif diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 67c35cd22b34..3024d9e27a2f 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -1,4 +1,4 @@ -//===-- llvm/Module.h - C++ class to represent a VM module ------*- C++ -*-===// +//===- llvm/Module.h - C++ class to represent a VM module -------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,6 +16,10 @@ #define LLVM_IR_MODULE_H #include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/Comdat.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -23,20 +27,27 @@ #include "llvm/IR/GlobalIFunc.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/SymbolTableListTraits.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/CodeGen.h" -#include "llvm/Support/DataTypes.h" +#include "llvm-c/Types.h" +#include +#include +#include +#include +#include +#include namespace llvm { -template class Optional; + class Error; class FunctionType; class GVMaterializer; class LLVMContext; class MemoryBuffer; class RandomNumberGenerator; -class StructType; template class SmallPtrSetImpl; +class StructType; /// A Module instance is used to store all the information related to an /// LLVM module. Modules are the top level container of all other LLVM @@ -54,47 +65,47 @@ class Module { /// @{ public: /// The type for the list of global variables. - typedef SymbolTableList GlobalListType; + using GlobalListType = SymbolTableList; /// The type for the list of functions. - typedef SymbolTableList FunctionListType; + using FunctionListType = SymbolTableList; /// The type for the list of aliases. - typedef SymbolTableList AliasListType; + using AliasListType = SymbolTableList; /// The type for the list of ifuncs. - typedef SymbolTableList IFuncListType; + using IFuncListType = SymbolTableList; /// The type for the list of named metadata. - typedef ilist NamedMDListType; + using NamedMDListType = ilist; /// The type of the comdat "symbol" table. - typedef StringMap ComdatSymTabType; + using ComdatSymTabType = StringMap; /// The Global Variable iterator. - typedef GlobalListType::iterator global_iterator; + using global_iterator = GlobalListType::iterator; /// The Global Variable constant iterator. - typedef GlobalListType::const_iterator const_global_iterator; + using const_global_iterator = GlobalListType::const_iterator; /// The Function iterators. - typedef FunctionListType::iterator iterator; + using iterator = FunctionListType::iterator; /// The Function constant iterator - typedef FunctionListType::const_iterator const_iterator; + using const_iterator = FunctionListType::const_iterator; /// The Function reverse iterator. - typedef FunctionListType::reverse_iterator reverse_iterator; + using reverse_iterator = FunctionListType::reverse_iterator; /// The Function constant reverse iterator. - typedef FunctionListType::const_reverse_iterator const_reverse_iterator; + using const_reverse_iterator = FunctionListType::const_reverse_iterator; /// The Global Alias iterators. - typedef AliasListType::iterator alias_iterator; + using alias_iterator = AliasListType::iterator; /// The Global Alias constant iterator - typedef AliasListType::const_iterator const_alias_iterator; + using const_alias_iterator = AliasListType::const_iterator; /// The Global IFunc iterators. 
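
Illustrative only: the renamed aliases leave the usual range-based Module traversal unchanged, as in this sketch:

    #include "llvm/IR/Module.h"
    using namespace llvm;

    unsigned countDefinitions(Module &M) {
      unsigned N = 0;
      for (Function &F : M)                  // Module::iterator
        if (!F.isDeclaration())
          ++N;
      for (GlobalVariable &GV : M.globals()) // Module::global_iterator
        if (GV.hasInitializer())
          ++N;
      return N;
    }
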
- typedef IFuncListType::iterator ifunc_iterator; + using ifunc_iterator = IFuncListType::iterator; /// The Global IFunc constant iterator - typedef IFuncListType::const_iterator const_ifunc_iterator; + using const_ifunc_iterator = IFuncListType::const_iterator; /// The named metadata iterators. - typedef NamedMDListType::iterator named_metadata_iterator; + using named_metadata_iterator = NamedMDListType::iterator; /// The named metadata constant iterators. - typedef NamedMDListType::const_iterator const_named_metadata_iterator; + using const_named_metadata_iterator = NamedMDListType::const_iterator; /// This enumeration defines the supported behaviors of module flags. enum ModFlagBehavior { @@ -141,6 +152,7 @@ public: ModFlagBehavior Behavior; MDString *Key; Metadata *Val; + ModuleFlagEntry(ModFlagBehavior B, MDString *K, Metadata *V) : Behavior(B), Key(K), Val(V) {} }; @@ -483,9 +495,11 @@ public: const GlobalListType &getGlobalList() const { return GlobalList; } /// Get the Module's list of global variables. GlobalListType &getGlobalList() { return GlobalList; } + static GlobalListType Module::*getSublistAccess(GlobalVariable*) { return &Module::GlobalList; } + /// Get the Module's list of functions (constant). const FunctionListType &getFunctionList() const { return FunctionList; } /// Get the Module's list of functions. @@ -493,31 +507,39 @@ public: static FunctionListType Module::*getSublistAccess(Function*) { return &Module::FunctionList; } + /// Get the Module's list of aliases (constant). const AliasListType &getAliasList() const { return AliasList; } /// Get the Module's list of aliases. AliasListType &getAliasList() { return AliasList; } + static AliasListType Module::*getSublistAccess(GlobalAlias*) { return &Module::AliasList; } + /// Get the Module's list of ifuncs (constant). const IFuncListType &getIFuncList() const { return IFuncList; } /// Get the Module's list of ifuncs. IFuncListType &getIFuncList() { return IFuncList; } + static IFuncListType Module::*getSublistAccess(GlobalIFunc*) { return &Module::IFuncList; } + /// Get the Module's list of named metadata (constant). const NamedMDListType &getNamedMDList() const { return NamedMDList; } /// Get the Module's list of named metadata. NamedMDListType &getNamedMDList() { return NamedMDList; } + static NamedMDListType Module::*getSublistAccess(NamedMDNode*) { return &Module::NamedMDList; } + /// Get the symbol table of global variable and function identifiers const ValueSymbolTable &getValueSymbolTable() const { return *ValSymTab; } /// Get the Module's symbol table of global variable and function identifiers. ValueSymbolTable &getValueSymbolTable() { return *ValSymTab; } + /// Get the Module's symbol table for COMDATs (constant). const ComdatSymTabType &getComdatSymbolTable() const { return ComdatSymTab; } /// Get the Module's symbol table for COMDATs. 
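
Not from the patch: the named-metadata list is normally walked through Module::named_metadata(), e.g. this hypothetical dump helper:

    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void listNamedMD(const Module &M) {
      for (const NamedMDNode &NMD : M.named_metadata())
        errs() << NMD.getName() << " has " << NMD.getNumOperands()
               << " operands\n";
    }
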
@@ -602,11 +624,11 @@ public: /// @name Convenience iterators /// @{ - typedef concat_iterator - global_object_iterator; - typedef concat_iterator - const_global_object_iterator; + using global_object_iterator = + concat_iterator; + using const_global_object_iterator = + concat_iterator; iterator_range global_objects() { return concat(functions(), globals()); @@ -627,13 +649,12 @@ public: return global_objects().end(); } - typedef concat_iterator - global_value_iterator; - typedef concat_iterator - const_global_value_iterator; + using global_value_iterator = + concat_iterator; + using const_global_value_iterator = + concat_iterator; iterator_range global_values() { return concat(functions(), globals(), aliases(), ifuncs()); @@ -682,28 +703,35 @@ public: : public std::iterator { NamedMDNode *CUs; unsigned Idx; + void SkipNoDebugCUs(); + public: explicit debug_compile_units_iterator(NamedMDNode *CUs, unsigned Idx) : CUs(CUs), Idx(Idx) { SkipNoDebugCUs(); } + debug_compile_units_iterator &operator++() { ++Idx; SkipNoDebugCUs(); return *this; } + debug_compile_units_iterator operator++(int) { debug_compile_units_iterator T(*this); ++Idx; return T; } + bool operator==(const debug_compile_units_iterator &I) const { return Idx == I.Idx; } + bool operator!=(const debug_compile_units_iterator &I) const { return Idx != I.Idx; } + DICompileUnit *operator*() const; DICompileUnit *operator->() const; }; @@ -833,6 +861,6 @@ inline Module *unwrap(LLVMModuleProviderRef MP) { return reinterpret_cast(MP); } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_MODULE_H diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index 53570bdf16f4..c46c609609e2 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -1,4 +1,4 @@ -//===-- llvm/ModuleSummaryIndex.h - Module Summary Index --------*- C++ -*-===// +//===- llvm/ModuleSummaryIndex.h - Module Summary Index ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,21 +16,33 @@ #ifndef LLVM_IR_MODULESUMMARYINDEX_H #define LLVM_IR_MODULESUMMARYINDEX_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/IR/Module.h" - +#include "llvm/IR/GlobalValue.h" +#include #include +#include +#include +#include +#include +#include +#include +#include +#include namespace llvm { namespace yaml { + template struct MappingTraits; -} + +} // end namespace yaml /// \brief Class to accumulate and hold information about a callee. struct CalleeInfo { @@ -47,7 +59,7 @@ struct CalleeInfo { class GlobalValueSummary; -typedef std::vector> GlobalValueSummaryList; +using GlobalValueSummaryList = std::vector>; struct GlobalValueSummaryInfo { /// The GlobalValue corresponding to this summary. This is only used in @@ -66,19 +78,22 @@ struct GlobalValueSummaryInfo { /// likely incur less overhead, as the value type is not very small and the size /// of the map is unknown, resulting in inefficiencies due to repeated /// insertions and resizing. -typedef std::map - GlobalValueSummaryMapTy; +using GlobalValueSummaryMapTy = + std::map; /// Struct that holds a reference to a particular GUID in a global value /// summary. 
struct ValueInfo { const GlobalValueSummaryMapTy::value_type *Ref = nullptr; + ValueInfo() = default; ValueInfo(const GlobalValueSummaryMapTy::value_type *Ref) : Ref(Ref) {} + operator bool() const { return Ref; } GlobalValue::GUID getGUID() const { return Ref->first; } const GlobalValue *getValue() const { return Ref->second.GV; } + ArrayRef> getSummaryList() const { return Ref->second.SummaryList; } @@ -88,9 +103,11 @@ template <> struct DenseMapInfo { static inline ValueInfo getEmptyKey() { return ValueInfo((GlobalValueSummaryMapTy::value_type *)-1); } + static inline ValueInfo getTombstoneKey() { return ValueInfo((GlobalValueSummaryMapTy::value_type *)-2); } + static bool isEqual(ValueInfo L, ValueInfo R) { return L.Ref == R.Ref; } static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.Ref; } }; @@ -138,7 +155,7 @@ private: /// This is the hash of the name of the symbol in the original file. It is /// identical to the GUID for global symbols, but differs for local since the /// GUID includes the module level id in the hash. - GlobalValue::GUID OriginalName; + GlobalValue::GUID OriginalName = 0; /// \brief Path of module IR containing value's definition, used to locate /// module during importing. @@ -157,7 +174,7 @@ private: protected: GlobalValueSummary(SummaryKind K, GVFlags Flags, std::vector Refs) - : Kind(K), Flags(Flags), OriginalName(0), RefEdgeList(std::move(Refs)) {} + : Kind(K), Flags(Flags), RefEdgeList(std::move(Refs)) {} public: virtual ~GlobalValueSummary() = default; @@ -242,7 +259,7 @@ public: class FunctionSummary : public GlobalValueSummary { public: /// call edge pair. - typedef std::pair EdgeTy; + using EdgeTy = std::pair; /// An "identifier" for a virtual function. This contains the type identifier /// represented as a GUID and the offset from the address point to the virtual @@ -376,12 +393,15 @@ public: template <> struct DenseMapInfo { static FunctionSummary::VFuncId getEmptyKey() { return {0, uint64_t(-1)}; } + static FunctionSummary::VFuncId getTombstoneKey() { return {0, uint64_t(-2)}; } + static bool isEqual(FunctionSummary::VFuncId L, FunctionSummary::VFuncId R) { return L.GUID == R.GUID && L.Offset == R.Offset; } + static unsigned getHashValue(FunctionSummary::VFuncId I) { return I.GUID; } }; @@ -389,14 +409,17 @@ template <> struct DenseMapInfo { static FunctionSummary::ConstVCall getEmptyKey() { return {{0, uint64_t(-1)}, {}}; } + static FunctionSummary::ConstVCall getTombstoneKey() { return {{0, uint64_t(-2)}, {}}; } + static bool isEqual(FunctionSummary::ConstVCall L, FunctionSummary::ConstVCall R) { return DenseMapInfo::isEqual(L.VFunc, R.VFunc) && L.Args == R.Args; } + static unsigned getHashValue(FunctionSummary::ConstVCall I) { return I.VFunc.GUID; } @@ -477,20 +500,20 @@ struct TypeIdSummary { }; /// 160 bits SHA1 -typedef std::array ModuleHash; +using ModuleHash = std::array; /// Type used for iterating through the global value summary map. -typedef GlobalValueSummaryMapTy::const_iterator const_gvsummary_iterator; -typedef GlobalValueSummaryMapTy::iterator gvsummary_iterator; +using const_gvsummary_iterator = GlobalValueSummaryMapTy::const_iterator; +using gvsummary_iterator = GlobalValueSummaryMapTy::iterator; /// String table to hold/own module path strings, which additionally holds the /// module ID assigned to each module during the plugin step, as well as a hash /// of the module. The StringMap makes a copy of and owns inserted strings. 
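The DenseMapInfo specializations in the ModuleSummaryIndex.h hunk above (for ValueInfo, FunctionSummary::VFuncId, and FunctionSummary::ConstVCall) each implement the same four-function contract that DenseMap requires of a custom key type: two reserved sentinel keys, a hash, and an equality test. A minimal sketch of that pattern, assuming LLVM's ADT headers are available; the FileId key type is hypothetical:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include <cstdint>

struct FileId {
  uint32_t Id;
  bool operator==(FileId RHS) const { return Id == RHS.Id; }
};

namespace llvm {
template <> struct DenseMapInfo<FileId> {
  // Reserved sentinel keys; real values must never collide with these.
  static FileId getEmptyKey() { return {UINT32_MAX}; }
  static FileId getTombstoneKey() { return {UINT32_MAX - 1}; }
  static unsigned getHashValue(FileId V) { return V.Id * 37u; }
  static bool isEqual(FileId L, FileId R) { return L == R; }
};
} // end namespace llvm

int main() {
  llvm::DenseMap<FileId, const char *> Names;
  Names[FileId{7}] = "a.cpp"; // usable as a key once the traits exist
  return Names.count(FileId{7}) == 1 ? 0 : 1;
}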
-typedef StringMap> ModulePathStringTableTy; +using ModulePathStringTableTy = StringMap>; /// Map of global value GUID to its summary, used to identify values defined in /// a particular module, and provide efficient access to their summary. -typedef std::map GVSummaryMapTy; +using GVSummaryMapTy = std::map; /// Class to hold module path string table and global value map, /// and encapsulate methods for operating on them. @@ -697,6 +720,6 @@ public: StringMap &ModuleToDefinedGVSummaries) const; }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_MODULESUMMARYINDEX_H diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h index c845112baa45..d03b7b65f81e 100644 --- a/include/llvm/IR/PassManager.h +++ b/include/llvm/IR/PassManager.h @@ -39,8 +39,8 @@ #define LLVM_IR_PASSMANAGER_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/TinyPtrVector.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" @@ -48,9 +48,15 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/TypeName.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/type_traits.h" +#include +#include +#include +#include #include #include +#include +#include +#include #include namespace llvm { @@ -469,15 +475,16 @@ public: } template void addPass(PassT Pass) { - typedef detail::PassModel - PassModelT; + using PassModelT = + detail::PassModel; + Passes.emplace_back(new PassModelT(std::move(Pass))); } private: - typedef detail::PassConcept - PassConceptT; + using PassConceptT = + detail::PassConcept; std::vector> Passes; @@ -486,12 +493,14 @@ private: }; extern template class PassManager; + /// \brief Convenience typedef for a pass manager over modules. -typedef PassManager ModulePassManager; +using ModulePassManager = PassManager; extern template class PassManager; + /// \brief Convenience typedef for a pass manager over functions. -typedef PassManager FunctionPassManager; +using FunctionPassManager = PassManager; /// \brief A container for analyses that lazily runs them and caches their /// results. @@ -504,11 +513,11 @@ public: private: // Now that we've defined our invalidator, we can define the concept types. - typedef detail::AnalysisResultConcept - ResultConceptT; - typedef detail::AnalysisPassConcept - PassConceptT; + using ResultConceptT = + detail::AnalysisResultConcept; + using PassConceptT = + detail::AnalysisPassConcept; /// \brief List of analysis pass IDs and associated concept pointers. /// @@ -516,18 +525,18 @@ private: /// erases. Provides the analysis ID to enable finding iterators to a given /// entry in maps below, and provides the storage for the actual result /// concept. - typedef std::list>> - AnalysisResultListT; + using AnalysisResultListT = + std::list>>; /// \brief Map type from IRUnitT pointer to our custom list type. - typedef DenseMap AnalysisResultListMapT; + using AnalysisResultListMapT = DenseMap; /// \brief Map type from a pair of analysis ID and IRUnitT pointer to an /// iterator into a particular result list (which is where the actual analysis /// result is stored). - typedef DenseMap, - typename AnalysisResultListT::iterator> - AnalysisResultMapT; + using AnalysisResultMapT = + DenseMap, + typename AnalysisResultListT::iterator>; public: /// API to communicate dependencies between analyses during invalidation. @@ -558,10 +567,10 @@ public: /// dependecies on it will become invalid as a result. 
template bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) { - typedef detail::AnalysisResultModel - ResultModelT; + using ResultModelT = + detail::AnalysisResultModel; + return invalidateImpl(PassT::ID(), IR, PA); } @@ -672,9 +681,11 @@ public: "This analysis pass was not registered prior to being queried"); ResultConceptT &ResultConcept = getResultImpl(PassT::ID(), IR, ExtraArgs...); - typedef detail::AnalysisResultModel - ResultModelT; + + using ResultModelT = + detail::AnalysisResultModel; + return static_cast(ResultConcept).Result; } @@ -692,9 +703,10 @@ public: if (!ResultConcept) return nullptr; - typedef detail::AnalysisResultModel - ResultModelT; + using ResultModelT = + detail::AnalysisResultModel; + return &static_cast(ResultConcept)->Result; } @@ -717,10 +729,10 @@ public: /// hashtable.) template bool registerPass(PassBuilderT &&PassBuilder) { - typedef decltype(PassBuilder()) PassT; - typedef detail::AnalysisPassModel - PassModelT; + using PassT = decltype(PassBuilder()); + using PassModelT = + detail::AnalysisPassModel; auto &PassPtr = AnalysisPasses[PassT::ID()]; if (PassPtr) @@ -876,7 +888,8 @@ private: } /// \brief Map type from module analysis pass ID to pass concept pointer. - typedef DenseMap> AnalysisPassMapT; + using AnalysisPassMapT = + DenseMap>; /// \brief Collection of module analysis passes, indexed by ID. AnalysisPassMapT AnalysisPasses; @@ -896,12 +909,14 @@ private: }; extern template class AnalysisManager; + /// \brief Convenience typedef for the Module analysis manager. -typedef AnalysisManager ModuleAnalysisManager; +using ModuleAnalysisManager = AnalysisManager; extern template class AnalysisManager; + /// \brief Convenience typedef for the Function analysis manager. -typedef AnalysisManager FunctionAnalysisManager; +using FunctionAnalysisManager = AnalysisManager; /// \brief An analysis over an "outer" IR unit that provides access to an /// analysis manager over an "inner" IR unit. The inner unit must be contained @@ -927,20 +942,14 @@ public: class Result { public: explicit Result(AnalysisManagerT &InnerAM) : InnerAM(&InnerAM) {} + Result(Result &&Arg) : InnerAM(std::move(Arg.InnerAM)) { // We have to null out the analysis manager in the moved-from state // because we are taking ownership of the responsibilty to clear the // analysis state. Arg.InnerAM = nullptr; } - Result &operator=(Result &&RHS) { - InnerAM = RHS.InnerAM; - // We have to null out the analysis manager in the moved-from state - // because we are taking ownership of the responsibilty to clear the - // analysis state. - RHS.InnerAM = nullptr; - return *this; - } + ~Result() { // InnerAM is cleared in a moved from state where there is nothing to do. if (!InnerAM) @@ -951,6 +960,15 @@ public: InnerAM->clear(); } + Result &operator=(Result &&RHS) { + InnerAM = RHS.InnerAM; + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibilty to clear the + // analysis state. + RHS.InnerAM = nullptr; + return *this; + } + /// \brief Accessor for the analysis manager. AnalysisManagerT &getManager() { return *InnerAM; } @@ -988,6 +1006,7 @@ public: private: friend AnalysisInfoMixin< InnerAnalysisManagerProxy>; + static AnalysisKey Key; AnalysisManagerT *InnerAM; @@ -998,8 +1017,8 @@ AnalysisKey InnerAnalysisManagerProxy::Key; /// Provide the \c FunctionAnalysisManager to \c Module proxy. 
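The PassManager.h churn above is type-alias modernization around an unchanged design: addPass wraps a concrete pass in a PassModel, and analyses are registered and queried by ID. A usage sketch under assumed LLVM headers; TrivialPass is a hypothetical pass, not one added by this patch:

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// PassInfoMixin supplies the boilerplate the pass concept expects.
struct TrivialPass : PassInfoMixin<TrivialPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
    return PreservedAnalyses::all(); // touches nothing, keeps all analyses
  }
};

void buildPipeline() {
  FunctionPassManager FPM;
  FPM.addPass(TrivialPass()); // stored via the PassModel wrapper shown above
}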
-typedef InnerAnalysisManagerProxy - FunctionAnalysisManagerModuleProxy; +using FunctionAnalysisManagerModuleProxy = + InnerAnalysisManagerProxy; /// Specialization of the invalidate method for the \c /// FunctionAnalysisManagerModuleProxy's result. @@ -1097,6 +1116,7 @@ public: private: friend AnalysisInfoMixin< OuterAnalysisManagerProxy>; + static AnalysisKey Key; const AnalysisManagerT *AM; @@ -1109,8 +1129,8 @@ AnalysisKey extern template class OuterAnalysisManagerProxy; /// Provide the \c ModuleAnalysisManager to \c Function proxy. -typedef OuterAnalysisManagerProxy - ModuleAnalysisManagerFunctionProxy; +using ModuleAnalysisManagerFunctionProxy = + OuterAnalysisManagerProxy; /// \brief Trivial adaptor that maps from a module to its functions. /// @@ -1274,6 +1294,6 @@ RepeatedPass createRepeatedPass(int Count, PassT P) { return RepeatedPass(Count, std::move(P)); } -} +} // end namespace llvm -#endif +#endif // LLVM_IR_PASSMANAGER_H diff --git a/include/llvm/IR/PassManagerInternal.h b/include/llvm/IR/PassManagerInternal.h index 387dc4c65c43..9195d4dfa428 100644 --- a/include/llvm/IR/PassManagerInternal.h +++ b/include/llvm/IR/PassManagerInternal.h @@ -27,7 +27,6 @@ namespace llvm { template class AllAnalysesOn; template class AnalysisManager; -class Invalidator; class PreservedAnalyses; /// \brief Implementation details of the pass manager interfaces. @@ -116,7 +115,7 @@ struct AnalysisResultConcept { /// \brief SFINAE metafunction for computing whether \c ResultT provides an /// \c invalidate member function. template class ResultHasInvalidateMethod { - typedef char EnabledType; + using EnabledType = char; struct DisabledType { char a, b; }; @@ -124,7 +123,7 @@ template class ResultHasInvalidateMethod { // Purely to help out MSVC which fails to disable the below specialization, // explicitly enable using the result type's invalidate routine if we can // successfully call that routine. - template struct Nonce { typedef EnabledType Type; }; + template struct Nonce { using Type = EnabledType; }; template static typename Nonce().invalidate( std::declval(), std::declval()))>::Type @@ -280,9 +279,9 @@ struct AnalysisPassModel : AnalysisPassConcept - ResultModelT; + using ResultModelT = + AnalysisResultModel; /// \brief The model delegates to the \c PassT::run method. /// diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h index 31a76b4ed6c3..6b2b22e82b95 100644 --- a/include/llvm/IR/PatternMatch.h +++ b/include/llvm/IR/PatternMatch.h @@ -29,11 +29,19 @@ #ifndef LLVM_IR_PATTERNMATCH_H #define LLVM_IR_PATTERNMATCH_H +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include namespace llvm { namespace PatternMatch { @@ -172,7 +180,9 @@ inline match_nan m_NaN() { return match_nan(); } struct apint_match { const APInt *&Res; + apint_match(const APInt *&R) : Res(R) {} + template bool match(ITy *V) { if (auto *CI = dyn_cast(V)) { Res = &CI->getValue(); @@ -230,7 +240,9 @@ template struct cst_pred_ty : public Predicate { /// satisfy a specified predicate, and bind them to an APInt. 
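The PatternMatch.h structs being touched here (apint_match, bind_ty, specificval_ty, and friends) are the building blocks behind the m_* combinators. A short sketch of how they compose, assuming LLVM headers; isAddOfConstant is an illustrative helper, not part of the patch:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// True iff V computes "X + C"; binds X via bind_ty and C via apint_match.
static bool isAddOfConstant(llvm::Value *V, llvm::Value *&X,
                            const llvm::APInt *&C) {
  using namespace llvm::PatternMatch;
  return match(V, m_Add(m_Value(X), m_APInt(C)));
}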
template struct api_pred_ty : public Predicate { const APInt *&Res; + api_pred_ty(const APInt *&R) : Res(R) {} + template bool match(ITy *V) { if (const auto *CI = dyn_cast(V)) if (this->isValue(CI->getValue())) { @@ -294,6 +306,7 @@ inline api_pred_ty m_MaxSignedValue(const APInt *&V) { return template struct bind_ty { Class *&VR; + bind_ty(Class *&V) : VR(V) {} template bool match(ITy *V) { @@ -326,6 +339,7 @@ inline bind_ty m_ConstantFP(ConstantFP *&C) { return C; } /// \brief Match a specified Value*. struct specificval_ty { const Value *Val; + specificval_ty(const Value *V) : Val(V) {} template bool match(ITy *V) { return V == Val; } @@ -338,6 +352,7 @@ inline specificval_ty m_Specific(const Value *V) { return V; } /// that value. struct specific_fpval { double Val; + specific_fpval(double V) : Val(V) {} template bool match(ITy *V) { @@ -360,6 +375,7 @@ inline specific_fpval m_FPOne() { return m_SpecificFP(1.0); } struct bind_const_intval_ty { uint64_t &VR; + bind_const_intval_ty(uint64_t &V) : VR(V) {} template bool match(ITy *V) { @@ -376,6 +392,7 @@ struct bind_const_intval_ty { // value. struct specific_intval { uint64_t Val; + specific_intval(uint64_t V) : Val(V) {} template bool match(ITy *V) { @@ -939,6 +956,7 @@ template inline fneg_match m_FNeg(const LHS &L) { struct br_match { BasicBlock *&Succ; + br_match(BasicBlock *&Succ) : Succ(Succ) {} template bool match(OpTy *V) { @@ -956,6 +974,7 @@ inline br_match m_UnconditionalBr(BasicBlock *&Succ) { return br_match(Succ); } template struct brc_match { Cond_t Cond; BasicBlock *&T, *&F; + brc_match(const Cond_t &C, BasicBlock *&t, BasicBlock *&f) : Cond(C), T(t), F(f) {} @@ -1202,6 +1221,7 @@ m_UnordFMin(const LHS &L, const RHS &R) { template struct Argument_match { unsigned OpI; Opnd_t Val; + Argument_match(unsigned OpIdx, const Opnd_t &V) : OpI(OpIdx), Val(V) {} template bool match(OpTy *V) { @@ -1219,6 +1239,7 @@ inline Argument_match m_Argument(const Opnd_t &Op) { /// \brief Intrinsic matchers. struct IntrinsicID_match { unsigned ID; + IntrinsicID_match(Intrinsic::ID IntrID) : ID(IntrID) {} template bool match(OpTy *V) { @@ -1239,21 +1260,23 @@ template struct m_Intrinsic_Ty; template struct m_Intrinsic_Ty { - typedef match_combine_and> Ty; + using Ty = match_combine_and>; }; template struct m_Intrinsic_Ty { - typedef match_combine_and::Ty, Argument_match> - Ty; + using Ty = + match_combine_and::Ty, Argument_match>; }; template struct m_Intrinsic_Ty { - typedef match_combine_and::Ty, - Argument_match> Ty; + using Ty = + match_combine_and::Ty, + Argument_match>; }; template struct m_Intrinsic_Ty { - typedef match_combine_and::Ty, - Argument_match> Ty; + using Ty = + match_combine_and::Ty, + Argument_match>; }; /// \brief Match intrinsic calls like this: @@ -1437,4 +1460,4 @@ m_c_UMax(const LHS &L, const RHS &R) { } // end namespace PatternMatch } // end namespace llvm -#endif +#endif // LLVM_IR_PATTERNMATCH_H diff --git a/include/llvm/IR/ProfileSummary.h b/include/llvm/IR/ProfileSummary.h index f4248014c6e1..d85ce8c443ec 100644 --- a/include/llvm/IR/ProfileSummary.h +++ b/include/llvm/IR/ProfileSummary.h @@ -1,4 +1,4 @@ -//===-- ProfileSummary.h - Profile summary data structure. ------*- C++ -*-===// +//===- ProfileSummary.h - Profile summary data structure. 
-------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,21 +11,17 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_PROFILE_SUMMARY_H -#define LLVM_SUPPORT_PROFILE_SUMMARY_H +#ifndef LLVM_IR_PROFILESUMMARY_H +#define LLVM_IR_PROFILESUMMARY_H +#include #include -#include #include -#include "llvm/Support/Casting.h" - namespace llvm { class LLVMContext; class Metadata; -class MDTuple; -class MDNode; // The profile summary is one or more (Cutoff, MinCount, NumCounts) triplets. // The semantics of counts depend on the type of profile. For instrumentation @@ -37,12 +33,13 @@ struct ProfileSummaryEntry { uint32_t Cutoff; ///< The required percentile of counts. uint64_t MinCount; ///< The minimum count for this percentile. uint64_t NumCounts; ///< Number of counts >= the minimum count. + ProfileSummaryEntry(uint32_t TheCutoff, uint64_t TheMinCount, uint64_t TheNumCounts) : Cutoff(TheCutoff), MinCount(TheMinCount), NumCounts(TheNumCounts) {} }; -typedef std::vector SummaryEntryVector; +using SummaryEntryVector = std::vector; class ProfileSummary { public: @@ -59,6 +56,7 @@ private: public: static const int Scale = 1000000; + ProfileSummary(Kind K, SummaryEntryVector DetailedSummary, uint64_t TotalCount, uint64_t MaxCount, uint64_t MaxInternalCount, uint64_t MaxFunctionCount, @@ -67,6 +65,7 @@ public: TotalCount(TotalCount), MaxCount(MaxCount), MaxInternalCount(MaxInternalCount), MaxFunctionCount(MaxFunctionCount), NumCounts(NumCounts), NumFunctions(NumFunctions) {} + Kind getKind() const { return PSK; } /// \brief Return summary information as metadata. Metadata *getMD(LLVMContext &Context); @@ -82,4 +81,5 @@ public: }; } // end namespace llvm -#endif + +#endif // LLVM_IR_PROFILESUMMARY_H diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h index 03151cd7c8f7..f01607614a0c 100644 --- a/include/llvm/IR/Statepoint.h +++ b/include/llvm/IR/Statepoint.h @@ -1,4 +1,4 @@ -//===-- llvm/IR/Statepoint.h - gc.statepoint utilities ----------*- C++ -*-===// +//===- llvm/IR/Statepoint.h - gc.statepoint utilities -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -24,10 +24,12 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" #include #include #include @@ -87,7 +89,7 @@ protected: } public: - typedef typename CallSiteTy::arg_iterator arg_iterator; + using arg_iterator = typename CallSiteTy::arg_iterator; enum { IDPos = 0, @@ -300,8 +302,9 @@ public: class ImmutableStatepoint : public StatepointBase { - typedef StatepointBase Base; + using Base = + StatepointBase; public: explicit ImmutableStatepoint(const Instruction *I) : Base(I) {} @@ -312,7 +315,7 @@ public: /// to a gc.statepoint. 
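The classof overloads added in the surrounding Statepoint.h hunks are the hooks that LLVM's isa<>/dyn_cast<> machinery dispatches through. A self-contained sketch of the idiom, assuming llvm/Support/Casting.h is available; the Node/Leaf hierarchy is hypothetical:

#include "llvm/Support/Casting.h"
#include <cassert>

struct Node {
  enum Kind { K_Leaf, K_Other };
  Kind K;
  explicit Node(Kind K) : K(K) {}
};

struct Leaf : Node {
  Leaf() : Node(K_Leaf) {}
  // isa<Leaf>(N) and dyn_cast<Leaf>(N) bottom out in this predicate.
  static bool classof(const Node *N) { return N->K == K_Leaf; }
};

int main() {
  Leaf L;
  Node *N = &L;
  assert(llvm::isa<Leaf>(N));
  if (Leaf *LP = llvm::dyn_cast<Leaf>(N)) // non-null iff classof holds
    (void)LP;
  return 0;
}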
class Statepoint : public StatepointBase { - typedef StatepointBase Base; + using Base = StatepointBase; public: explicit Statepoint(Instruction *I) : Base(I) {} @@ -327,6 +330,7 @@ public: return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate || I->getIntrinsicID() == Intrinsic::experimental_gc_result; } + static inline bool classof(const Value *V) { return isa(V) && classof(cast(V)); } @@ -369,6 +373,7 @@ public: static inline bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate; } + static inline bool classof(const Value *V) { return isa(V) && classof(cast(V)); } @@ -403,6 +408,7 @@ public: static inline bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::experimental_gc_result; } + static inline bool classof(const Value *V) { return isa(V) && classof(cast(V)); } diff --git a/include/llvm/IR/SymbolTableListTraits.h b/include/llvm/IR/SymbolTableListTraits.h index 49a5fb21297d..87ce902c2811 100644 --- a/include/llvm/IR/SymbolTableListTraits.h +++ b/include/llvm/IR/SymbolTableListTraits.h @@ -48,7 +48,7 @@ class ValueSymbolTable; template struct SymbolTableListParentType {}; #define DEFINE_SYMBOL_TABLE_PARENT_TYPE(NODE, PARENT) \ - template <> struct SymbolTableListParentType { typedef PARENT type; }; + template <> struct SymbolTableListParentType { using type = PARENT; }; DEFINE_SYMBOL_TABLE_PARENT_TYPE(Instruction, BasicBlock) DEFINE_SYMBOL_TABLE_PARENT_TYPE(BasicBlock, Function) DEFINE_SYMBOL_TABLE_PARENT_TYPE(Argument, Function) @@ -65,10 +65,10 @@ template class SymbolTableList; // template class SymbolTableListTraits : public ilist_alloc_traits { - typedef SymbolTableList ListTy; - typedef typename simple_ilist::iterator iterator; - typedef - typename SymbolTableListParentType::type ItemParentClass; + using ListTy = SymbolTableList; + using iterator = typename simple_ilist::iterator; + using ItemParentClass = + typename SymbolTableListParentType::type; public: SymbolTableListTraits() = default; diff --git a/include/llvm/IR/TrackingMDRef.h b/include/llvm/IR/TrackingMDRef.h index 12b196432006..bdec904ad1e1 100644 --- a/include/llvm/IR/TrackingMDRef.h +++ b/include/llvm/IR/TrackingMDRef.h @@ -139,31 +139,35 @@ public: bool hasTrivialDestructor() const { return Ref.hasTrivialDestructor(); } }; -typedef TypedTrackingMDRef TrackingMDNodeRef; -typedef TypedTrackingMDRef TrackingValueAsMetadataRef; +using TrackingMDNodeRef = TypedTrackingMDRef; +using TrackingValueAsMetadataRef = TypedTrackingMDRef; // Expose the underlying metadata to casting. 
template <> struct simplify_type { - typedef Metadata *SimpleType; + using SimpleType = Metadata *; + static SimpleType getSimplifiedValue(TrackingMDRef &MD) { return MD.get(); } }; template <> struct simplify_type { - typedef Metadata *SimpleType; + using SimpleType = Metadata *; + static SimpleType getSimplifiedValue(const TrackingMDRef &MD) { return MD.get(); } }; template struct simplify_type> { - typedef T *SimpleType; + using SimpleType = T *; + static SimpleType getSimplifiedValue(TypedTrackingMDRef &MD) { return MD.get(); } }; template struct simplify_type> { - typedef T *SimpleType; + using SimpleType = T *; + static SimpleType getSimplifiedValue(const TypedTrackingMDRef &MD) { return MD.get(); } diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h index e6a0df937e9b..82362107e41e 100644 --- a/include/llvm/IR/Type.h +++ b/include/llvm/IR/Type.h @@ -1,4 +1,4 @@ -//===-- llvm/Type.h - Classes for handling data types -----------*- C++ -*-===// +//===- llvm/Type.h - Classes for handling data types ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,21 +18,22 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/DataTypes.h" +#include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include +#include +#include namespace llvm { -class PointerType; +template struct GraphTraits; class IntegerType; -class raw_ostream; -class Module; class LLVMContext; -class LLVMContextImpl; +class PointerType; +class raw_ostream; class StringRef; -template struct GraphTraits; /// The instances of the Type class are immutable: once they are created, /// they are never changed. Also note that only one instance of a particular @@ -86,9 +87,9 @@ private: protected: friend class LLVMContextImpl; + explicit Type(LLVMContext &C, TypeID tid) - : Context(C), ID(tid), SubclassData(0), - NumContainedTys(0), ContainedTys(nullptr) {} + : Context(C), ID(tid), SubclassData(0) {} ~Type() = default; unsigned getSubclassData() const { return SubclassData; } @@ -100,14 +101,14 @@ protected: } /// Keeps track of how many Type*'s there are in the ContainedTys list. - unsigned NumContainedTys; + unsigned NumContainedTys = 0; /// A pointer to the array of Types contained by this Type. For example, this /// includes the arguments of a function type, the elements of a structure, /// the pointee of a pointer, the element type of an array, etc. This pointer /// may be 0 for types that don't contain other types (Integer, Double, /// Float). - Type * const *ContainedTys; + Type * const *ContainedTys = nullptr; static bool isSequentialType(TypeID TyID) { return TyID == ArrayTyID || TyID == VectorTyID; @@ -122,6 +123,7 @@ public: /// inlined with the operands when printing an instruction. void print(raw_ostream &O, bool IsForDebug = false, bool NoDetails = false) const; + void dump() const; /// Return the LLVMContext in which this type was uniqued. @@ -299,14 +301,16 @@ public: //===--------------------------------------------------------------------===// // Type Iteration support. 
// - typedef Type * const *subtype_iterator; + using subtype_iterator = Type * const *; + subtype_iterator subtype_begin() const { return ContainedTys; } subtype_iterator subtype_end() const { return &ContainedTys[NumContainedTys];} ArrayRef subtypes() const { return makeArrayRef(subtype_begin(), subtype_end()); } - typedef std::reverse_iterator subtype_reverse_iterator; + using subtype_reverse_iterator = std::reverse_iterator; + subtype_reverse_iterator subtype_rbegin() const { return subtype_reverse_iterator(subtype_end()); } @@ -348,6 +352,7 @@ public: } inline uint64_t getArrayNumElements() const; + Type *getArrayElementType() const { assert(getTypeID() == ArrayTyID); return ContainedTys[0]; @@ -444,8 +449,8 @@ template <> struct isa_impl { // graph of sub types. template <> struct GraphTraits { - typedef Type *NodeRef; - typedef Type::subtype_iterator ChildIteratorType; + using NodeRef = Type *; + using ChildIteratorType = Type::subtype_iterator; static NodeRef getEntryNode(Type *T) { return T; } static ChildIteratorType child_begin(NodeRef N) { return N->subtype_begin(); } @@ -453,8 +458,8 @@ template <> struct GraphTraits { }; template <> struct GraphTraits { - typedef const Type *NodeRef; - typedef Type::subtype_iterator ChildIteratorType; + using NodeRef = const Type *; + using ChildIteratorType = Type::subtype_iterator; static NodeRef getEntryNode(NodeRef T) { return T; } static ChildIteratorType child_begin(NodeRef N) { return N->subtype_begin(); } @@ -474,6 +479,6 @@ inline LLVMTypeRef *wrap(Type **Tys) { return reinterpret_cast(const_cast(Tys)); } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_TYPE_H diff --git a/include/llvm/IR/TypeFinder.h b/include/llvm/IR/TypeFinder.h index 48c4f1161aa1..c050c388d398 100644 --- a/include/llvm/IR/TypeFinder.h +++ b/include/llvm/IR/TypeFinder.h @@ -44,8 +44,8 @@ public: void run(const Module &M, bool onlyNamed); void clear(); - typedef std::vector::iterator iterator; - typedef std::vector::const_iterator const_iterator; + using iterator = std::vector::iterator; + using const_iterator = std::vector::const_iterator; iterator begin() { return StructTypes.begin(); } iterator end() { return StructTypes.end(); } diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h index 6b56546f4421..d3a59d8a060e 100644 --- a/include/llvm/IR/Use.h +++ b/include/llvm/IR/Use.h @@ -1,4 +1,4 @@ -//===-- llvm/Use.h - Definition of the Use class ----------------*- C++ -*-===// +//===- llvm/Use.h - Definition of the Use class -----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -27,14 +27,14 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/Compiler.h" #include "llvm-c/Types.h" namespace llvm { -class Value; -class User; -class Use; template struct simplify_type; +class User; +class Value; /// \brief A Use represents the edge between a Value definition and its users. /// @@ -65,23 +65,27 @@ public: /// use the LSB regardless of pointer alignment on different targets. struct UserRefPointerTraits { static inline void *getAsVoidPointer(User *P) { return P; } + static inline User *getFromVoidPointer(void *P) { return (User *)P; } + enum { NumLowBitsAvailable = 1 }; }; // A type for the word following an array of hung-off Uses in memory, which is // a pointer back to their User with the bottom bit set. - typedef PointerIntPair UserRef; + using UserRef = PointerIntPair; /// Pointer traits for the Prev PointerIntPair. 
This ensures we always use /// the two LSBs regardless of pointer alignment on different targets. struct PrevPointerTraits { static inline void *getAsVoidPointer(Use **P) { return P; } + static inline Use **getFromVoidPointer(void *P) { return (Use **)P; } + enum { NumLowBitsAvailable = 2 }; }; @@ -95,9 +99,11 @@ private: enum PrevPtrTag { zeroDigitTag, oneDigitTag, stopTag, fullStopTag }; /// Constructor - Use(PrevPtrTag tag) : Val(nullptr) { Prev.setInt(tag); } + Use(PrevPtrTag tag) { Prev.setInt(tag); } public: + friend class Value; + operator Value *() const { return Val; } Value *get() const { return Val; } @@ -133,7 +139,7 @@ public: private: const Use *getImpliedUser() const LLVM_READONLY; - Value *Val; + Value *Val = nullptr; Use *Next; PointerIntPair Prev; @@ -153,18 +159,18 @@ private: if (Next) Next->setPrev(StrippedPrev); } - - friend class Value; }; /// \brief Allow clients to treat uses just like values when using /// casting operators. template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(Use &Val) { return Val.get(); } }; template <> struct simplify_type { - typedef /*const*/ Value *SimpleType; + using SimpleType = /*const*/ Value *; + static SimpleType getSimplifiedValue(const Use &Val) { return Val.get(); } }; diff --git a/include/llvm/IR/UseListOrder.h b/include/llvm/IR/UseListOrder.h index ebe99223facd..a8b394fc6302 100644 --- a/include/llvm/IR/UseListOrder.h +++ b/include/llvm/IR/UseListOrder.h @@ -37,7 +37,7 @@ struct UseListOrder { UseListOrder &operator=(UseListOrder &&) = default; }; -typedef std::vector UseListOrderStack; +using UseListOrderStack = std::vector; } // end namespace llvm diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index 54758a9b6d6a..7b9d451aaf53 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -1,4 +1,4 @@ -//===-- llvm/User.h - User class definition ---------------------*- C++ -*-===// +//===- llvm/User.h - User class definition ----------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -114,6 +114,7 @@ protected: ? OperandTraits::op_end(const_cast(that))[Idx] : OperandTraits::op_begin(const_cast(that))[Idx]; } + template Use &Op() { return OpFrom(this); } @@ -205,10 +206,10 @@ public: // --------------------------------------------------------------------------- // Operand Iterator interface... // - typedef Use* op_iterator; - typedef const Use* const_op_iterator; - typedef iterator_range op_range; - typedef iterator_range const_op_range; + using op_iterator = Use*; + using const_op_iterator = const Use*; + using op_range = iterator_range; + using const_op_range = iterator_range; op_iterator op_begin() { return getOperandList(); } const_op_iterator op_begin() const { return getOperandList(); } @@ -252,6 +253,7 @@ public: ptrdiff_t, const Value *, const Value *> { explicit const_value_op_iterator(const Use *U = nullptr) : iterator_adaptor_base(U) {} + const Value *operator*() const { return *I; } const Value *operator->() const { return operator*(); } }; @@ -290,6 +292,7 @@ public: return isa(V) || isa(V); } }; + // Either Use objects, or a Use pointer can be prepended to User. 
static_assert(alignof(Use) >= alignof(User), "Alignment is insufficient after objects prepended to User"); @@ -297,13 +300,15 @@ static_assert(alignof(Use *) >= alignof(User), "Alignment is insufficient after objects prepended to User"); template<> struct simplify_type { - typedef Value* SimpleType; + using SimpleType = Value*; + static SimpleType getSimplifiedValue(User::op_iterator &Val) { return Val->get(); } }; template<> struct simplify_type { - typedef /*const*/ Value* SimpleType; + using SimpleType = /*const*/ Value*; + static SimpleType getSimplifiedValue(User::const_op_iterator &Val) { return Val->get(); } diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index 00f821399257..96a370dcc35f 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -1,4 +1,4 @@ -//===-- llvm/Value.h - Definition of the Value class ------------*- C++ -*-===// +//===- llvm/Value.h - Definition of the Value class -------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -44,12 +44,12 @@ class LLVMContext; class Module; class ModuleSlotTracker; class raw_ostream; +template class StringMapEntry; class StringRef; class Twine; class Type; -template class StringMapEntry; -typedef StringMapEntry ValueName; +using ValueName = StringMapEntry; //===----------------------------------------------------------------------===// // Value Class @@ -120,9 +120,11 @@ private: template // UseT == 'Use' or 'const Use' class use_iterator_impl : public std::iterator { + friend class Value; + UseT *U; + explicit use_iterator_impl(UseT *u) : U(u) {} - friend class Value; public: use_iterator_impl() : U() {} @@ -309,8 +311,9 @@ public: return UseList == nullptr; } - typedef use_iterator_impl use_iterator; - typedef use_iterator_impl const_use_iterator; + using use_iterator = use_iterator_impl; + using const_use_iterator = use_iterator_impl; + use_iterator materialized_use_begin() { return use_iterator(UseList); } const_use_iterator materialized_use_begin() const { return const_use_iterator(UseList); @@ -345,8 +348,9 @@ public: return UseList == nullptr; } - typedef user_iterator_impl user_iterator; - typedef user_iterator_impl const_user_iterator; + using user_iterator = user_iterator_impl; + using const_user_iterator = user_iterator_impl; + user_iterator materialized_user_begin() { return user_iterator(UseList); } const_user_iterator materialized_user_begin() const { return const_user_iterator(UseList); @@ -560,7 +564,6 @@ public: /// block. const Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB) const; - Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB) { return const_cast( static_cast(this)->DoPHITranslation(CurBB, PredBB)); @@ -606,7 +609,7 @@ private: Use *Merged; Use **Next = &Merged; - for (;;) { + while (true) { if (!L) { *Next = R; break; diff --git a/include/llvm/IR/ValueHandle.h b/include/llvm/IR/ValueHandle.h index 393618d5511b..b45cc7b6dc02 100644 --- a/include/llvm/IR/ValueHandle.h +++ b/include/llvm/IR/ValueHandle.h @@ -17,10 +17,10 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include namespace llvm { -class ValueHandleBase; -template struct simplify_type; /// \brief This is the common base class of value handles. /// @@ -29,6 +29,7 @@ template struct simplify_type; /// below for details. class ValueHandleBase { friend class Value; + protected: /// \brief This indicates what sub class the handle actually is. 
/// @@ -40,24 +41,23 @@ protected: : ValueHandleBase(RHS.PrevPair.getInt(), RHS) {} ValueHandleBase(HandleBaseKind Kind, const ValueHandleBase &RHS) - : PrevPair(nullptr, Kind), Next(nullptr), Val(RHS.getValPtr()) { + : PrevPair(nullptr, Kind), Val(RHS.getValPtr()) { if (isValid(getValPtr())) AddToExistingUseList(RHS.getPrevPtr()); } private: PointerIntPair PrevPair; - ValueHandleBase *Next; - - Value *Val; + ValueHandleBase *Next = nullptr; + Value *Val = nullptr; void setValPtr(Value *V) { Val = V; } public: explicit ValueHandleBase(HandleBaseKind Kind) - : PrevPair(nullptr, Kind), Next(nullptr), Val(nullptr) {} + : PrevPair(nullptr, Kind) {} ValueHandleBase(HandleBaseKind Kind, Value *V) - : PrevPair(nullptr, Kind), Next(nullptr), Val(V) { + : PrevPair(nullptr, Kind), Val(V) { if (isValid(getValPtr())) AddToUseList(); } @@ -162,11 +162,13 @@ public: // Specialize simplify_type to allow WeakVH to participate in // dyn_cast, isa, etc. template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(WeakVH &WVH) { return WVH; } }; template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(const WeakVH &WVH) { return WVH; } }; @@ -205,11 +207,13 @@ public: // Specialize simplify_type to allow WeakTrackingVH to participate in // dyn_cast, isa, etc. template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(WeakTrackingVH &WVH) { return WVH; } }; template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(const WeakTrackingVH &WVH) { return WVH; } @@ -236,7 +240,7 @@ class AssertingVH : public ValueHandleBase #endif { - friend struct DenseMapInfo >; + friend struct DenseMapInfo>; #ifndef NDEBUG Value *getRawValPtr() const { return ValueHandleBase::getValPtr(); } @@ -282,20 +286,23 @@ public: // Specialize DenseMapInfo to allow AssertingVH to participate in DenseMap. 
template -struct DenseMapInfo > { +struct DenseMapInfo> { static inline AssertingVH getEmptyKey() { AssertingVH Res; Res.setRawValPtr(DenseMapInfo::getEmptyKey()); return Res; } + static inline AssertingVH getTombstoneKey() { AssertingVH Res; Res.setRawValPtr(DenseMapInfo::getTombstoneKey()); return Res; } + static unsigned getHashValue(const AssertingVH &Val) { return DenseMapInfo::getHashValue(Val.getRawValPtr()); } + static bool isEqual(const AssertingVH &LHS, const AssertingVH &RHS) { return DenseMapInfo::isEqual(LHS.getRawValPtr(), RHS.getRawValPtr()); @@ -303,7 +310,7 @@ struct DenseMapInfo > { }; template -struct isPodLike > { +struct isPodLike> { #ifdef NDEBUG static const bool value = true; #else @@ -356,7 +363,7 @@ public: static Value *GetAsValue(const Value *V) { return const_cast(V); } public: - TrackingVH() {} + TrackingVH() = default; TrackingVH(ValueTy *P) { setValPtr(P); } operator ValueTy*() const { @@ -495,10 +502,12 @@ public: PoisoningVH(ValueTy *P) : CallbackVH(GetAsValue(P)) {} PoisoningVH(const PoisoningVH &RHS) : CallbackVH(RHS), Poisoned(RHS.Poisoned) {} + ~PoisoningVH() { if (Poisoned) clearValPtr(); } + PoisoningVH &operator=(const PoisoningVH &RHS) { if (Poisoned) clearValPtr(); @@ -523,14 +532,17 @@ template struct DenseMapInfo> { Res.setRawValPtr(DenseMapInfo::getEmptyKey()); return Res; } + static inline PoisoningVH getTombstoneKey() { PoisoningVH Res; Res.setRawValPtr(DenseMapInfo::getTombstoneKey()); return Res; } + static unsigned getHashValue(const PoisoningVH &Val) { return DenseMapInfo::getHashValue(Val.getRawValPtr()); } + static bool isEqual(const PoisoningVH &LHS, const PoisoningVH &RHS) { return DenseMapInfo::isEqual(LHS.getRawValPtr(), RHS.getRawValPtr()); @@ -545,6 +557,6 @@ template struct isPodLike> { #endif }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_VALUEHANDLE_H diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h index 9648e1989f94..11d5823ee479 100644 --- a/include/llvm/IR/ValueMap.h +++ b/include/llvm/IR/ValueMap.h @@ -46,7 +46,6 @@ namespace llvm { template class ValueMapCallbackVH; - template class ValueMapIterator; template @@ -57,7 +56,7 @@ class ValueMapConstIterator; /// as possible with future versions of ValueMap. template struct ValueMapConfig { - typedef MutexT mutex_type; + using mutex_type = MutexT; /// If FollowRAUW is true, the ValueMap will update mappings on RAUW. If it's /// false, the ValueMap will leave the original mapping in place. 
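ValueMapConfig's FollowRAUW flag, documented just above, controls whether a ValueMap entry tracks its key through replaceAllUsesWith. A sketch of the default behavior (FollowRAUW is true), assuming LLVM headers and libraries; the module and globals are illustrative:

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueMap.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("m", Ctx);
  auto *A = new GlobalVariable(M, Type::getInt32Ty(Ctx), false,
                               GlobalValue::ExternalLinkage, nullptr, "a");
  auto *B = new GlobalVariable(M, Type::getInt32Ty(Ctx), false,
                               GlobalValue::ExternalLinkage, nullptr, "b");

  ValueMap<Value *, int> VM;
  VM[A] = 1;
  A->replaceAllUsesWith(B); // the callback value handle re-keys the entry to B
  return VM.count(B) == 1 ? 0 : 1;
}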
@@ -87,21 +86,21 @@ template> class ValueMap { friend class ValueMapCallbackVH; - typedef ValueMapCallbackVH ValueMapCVH; - typedef DenseMap> MapT; - typedef DenseMap MDMapT; - typedef typename Config::ExtraData ExtraData; + using ValueMapCVH = ValueMapCallbackVH; + using MapT = DenseMap>; + using MDMapT = DenseMap; + using ExtraData = typename Config::ExtraData; + MapT Map; Optional MDMap; ExtraData Data; - bool MayMapMetadata = true; public: - typedef KeyT key_type; - typedef ValueT mapped_type; - typedef std::pair value_type; - typedef unsigned size_type; + using key_type = KeyT; + using mapped_type = ValueT; + using value_type = std::pair; + using size_type = unsigned; explicit ValueMap(unsigned NumInitBuckets = 64) : Map(NumInitBuckets), Data() {} @@ -132,8 +131,9 @@ public: return Where->second.get(); } - typedef ValueMapIterator iterator; - typedef ValueMapConstIterator const_iterator; + using iterator = ValueMapIterator; + using const_iterator = ValueMapConstIterator; + inline iterator begin() { return iterator(Map.begin()); } inline iterator end() { return iterator(Map.end()); } inline const_iterator begin() const { return const_iterator(Map.begin()); } @@ -244,8 +244,8 @@ class ValueMapCallbackVH final : public CallbackVH { friend class ValueMap; friend struct DenseMapInfo; - typedef ValueMap ValueMapT; - typedef typename std::remove_pointer::type KeySansPointerT; + using ValueMapT = ValueMap; + using KeySansPointerT = typename std::remove_pointer::type; ValueMapT *Map; @@ -298,7 +298,7 @@ public: template struct DenseMapInfo> { - typedef ValueMapCallbackVH VH; + using VH = ValueMapCallbackVH; static inline VH getEmptyKey() { return VH(DenseMapInfo::getEmptyKey()); @@ -330,8 +330,8 @@ class ValueMapIterator : public std::iterator, ptrdiff_t> { - typedef typename DenseMapT::iterator BaseT; - typedef typename DenseMapT::mapped_type ValueT; + using BaseT = typename DenseMapT::iterator; + using ValueT = typename DenseMapT::mapped_type; BaseT I; @@ -344,7 +344,9 @@ public: struct ValueTypeProxy { const KeyT first; ValueT& second; + ValueTypeProxy *operator->() { return this; } + operator std::pair() const { return std::make_pair(first, second); } @@ -380,8 +382,8 @@ class ValueMapConstIterator : public std::iterator, ptrdiff_t> { - typedef typename DenseMapT::const_iterator BaseT; - typedef typename DenseMapT::mapped_type ValueT; + using BaseT = typename DenseMapT::const_iterator; + using ValueT = typename DenseMapT::mapped_type; BaseT I; diff --git a/include/llvm/IR/ValueSymbolTable.h b/include/llvm/IR/ValueSymbolTable.h index 9e86751dae6f..26cbbfabfc0c 100644 --- a/include/llvm/IR/ValueSymbolTable.h +++ b/include/llvm/IR/ValueSymbolTable.h @@ -49,13 +49,13 @@ class ValueSymbolTable { /// @{ public: /// @brief A mapping of names to values. - typedef StringMap ValueMap; + using ValueMap = StringMap; /// @brief An iterator over a ValueMap. - typedef ValueMap::iterator iterator; + using iterator = ValueMap::iterator; /// @brief A const_iterator over a ValueMap. 
- typedef ValueMap::const_iterator const_iterator; + using const_iterator = ValueMap::const_iterator; /// @} /// @name Constructors diff --git a/include/llvm/IR/Verifier.h b/include/llvm/IR/Verifier.h index 71f727c3d4fc..15e52d9e0742 100644 --- a/include/llvm/IR/Verifier.h +++ b/include/llvm/IR/Verifier.h @@ -21,13 +21,17 @@ #ifndef LLVM_IR_VERIFIER_H #define LLVM_IR_VERIFIER_H +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/PassManager.h" +#include namespace llvm { +class APInt; class Function; class FunctionPass; -class ModulePass; +class Instruction; +class MDNode; class Module; class raw_ostream; struct VerifierSupport; @@ -47,7 +51,7 @@ class TBAAVerifier { /// the offset of the access. If zero, only a zero offset is allowed. /// /// \c BitWidth has no meaning if \c IsInvalid is true. - typedef std::pair TBAABaseNodeSummary; + using TBAABaseNodeSummary = std::pair; DenseMap TBAABaseNodes; /// Maps an alleged scalar TBAA node to a boolean that is true if the said @@ -101,12 +105,14 @@ FunctionPass *createVerifierPass(bool FatalErrors = true); /// and debug info errors. class VerifierAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; + static AnalysisKey Key; public: struct Result { bool IRBroken, DebugInfoBroken; }; + Result run(Module &M, ModuleAnalysisManager &); Result run(Function &F, FunctionAnalysisManager &); }; @@ -136,7 +142,6 @@ public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; +} // end namespace llvm -} // End llvm namespace - -#endif +#endif // LLVM_IR_VERIFIER_H diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 44ff4c1a581b..cf314e19d1ca 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -130,6 +130,7 @@ void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); +void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); void initializeFEntryInserterPass(PassRegistry&); void initializeFinalizeMachineBundlesPass(PassRegistry&); @@ -186,6 +187,7 @@ void initializeLintPass(PassRegistry&); void initializeLiveDebugValuesPass(PassRegistry&); void initializeLiveDebugVariablesPass(PassRegistry&); void initializeLiveIntervalsPass(PassRegistry&); +void initializeLiveRangeShrinkPass(PassRegistry&); void initializeLiveRegMatrixPass(PassRegistry&); void initializeLiveStacksPass(PassRegistry&); void initializeLiveVariablesPass(PassRegistry&); @@ -319,11 +321,12 @@ void initializeSCCPLegacyPassPass(PassRegistry&); void initializeSCEVAAWrapperPassPass(PassRegistry&); void initializeSLPVectorizerPass(PassRegistry&); void initializeSROALegacyPassPass(PassRegistry&); -void initializeSafeStackPass(PassRegistry&); +void initializeSafeStackLegacyPassPass(PassRegistry&); void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&); void initializeSanitizerCoverageModulePass(PassRegistry&); void initializeScalarEvolutionWrapperPassPass(PassRegistry&); void initializeScalarizerPass(PassRegistry&); +void initializeScalarizeMaskedMemIntrinPass(PassRegistry&); void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&); void initializeSeparateConstOffsetFromGEPPass(PassRegistry&); void initializeShadowStackGCLoweringPass(PassRegistry&); diff --git a/include/llvm/LibDriver/LibDriver.h b/include/llvm/LibDriver/LibDriver.h deleted file mode 100644 index 
95feb60be403..000000000000 --- a/include/llvm/LibDriver/LibDriver.h +++ /dev/null @@ -1,24 +0,0 @@ -//===- llvm/LibDriver/LibDriver.h - lib.exe-compatible driver ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Defines an interface to a lib.exe-compatible driver that also understands -// bitcode files. Used by llvm-lib and lld-link /lib. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBDRIVER_LIBDRIVER_H -#define LLVM_LIBDRIVER_LIBDRIVER_H - -namespace llvm { -template class ArrayRef; - -int libDriverMain(ArrayRef ARgs); -} - -#endif diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 39a86e838bde..5c398b2ab567 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -206,6 +206,7 @@ namespace { (void) llvm::createMemDerefPrinter(); (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); + (void) llvm::createScalarizeMaskedMemIntrinPass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h index 4bc39d98b7af..d200d4a148e3 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -67,7 +67,8 @@ public: WasmObjectFile(MemoryBufferRef Object, Error &Err); const wasm::WasmObjectHeader &getHeader() const; - const WasmSymbol &getWasmSymbol(DataRefImpl Symb) const; + const WasmSymbol &getWasmSymbol(const DataRefImpl &Symb) const; + const WasmSymbol &getWasmSymbol(const SymbolRef &Symbol) const; const WasmSection &getWasmSection(const SectionRef &Section) const; const wasm::WasmRelocation &getWasmRelocation(const RelocationRef& Ref) const; @@ -81,6 +82,10 @@ public: const std::vector& globals() const { return Globals; } const std::vector& exports() const { return Exports; } + uint32_t getNumberOfSymbols() const { + return Symbols.size(); + } + const std::vector& elements() const { return ElemSegments; } diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h index bd7d72be4dbc..7b70c9537827 100644 --- a/include/llvm/ObjectYAML/WasmYAML.h +++ b/include/llvm/ObjectYAML/WasmYAML.h @@ -34,17 +34,6 @@ struct FileHeader { yaml::Hex32 Version; }; -struct Import { - StringRef Module; - StringRef Field; - ExportKind Kind; - union { - uint32_t SigIndex; - ValueType GlobalType; - }; - bool GlobalMutable; -}; - struct Limits { yaml::Hex32 Flags; yaml::Hex32 Initial; @@ -74,6 +63,18 @@ struct Global { wasm::WasmInitExpr InitExpr; }; +struct Import { + StringRef Module; + StringRef Field; + ExportKind Kind; + union { + uint32_t SigIndex; + Global GlobalImport; + Table TableImport; + Limits Memory; + }; +}; + struct LocalDecl { ValueType Type; uint32_t Count; diff --git a/include/llvm/ProfileData/SampleProfWriter.h b/include/llvm/ProfileData/SampleProfWriter.h index 9d69af32dd46..86af1038d74e 100644 --- a/include/llvm/ProfileData/SampleProfWriter.h +++ b/include/llvm/ProfileData/SampleProfWriter.h @@ -43,16 +43,7 @@ public: /// Write all the sample profiles in the given map of samples. /// /// \returns status code of the file update operation. 
- std::error_code write(const StringMap &ProfileMap) { - if (std::error_code EC = writeHeader(ProfileMap)) - return EC; - for (const auto &I : ProfileMap) { - const FunctionSamples &Profile = I.second; - if (std::error_code EC = write(Profile)) - return EC; - } - return sampleprof_error::success; - } + std::error_code write(const StringMap &ProfileMap); raw_ostream &getOutputStream() { return *OutputStream; } diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h index bad31cd38d6a..77c99ffff919 100644 --- a/include/llvm/Support/BinaryStreamArray.h +++ b/include/llvm/Support/BinaryStreamArray.h @@ -139,6 +139,7 @@ public: } uint32_t offset() const { return AbsOffset; } + uint32_t getRecordLength() const { return ThisLen; } private: void moveToEnd() { @@ -294,6 +295,8 @@ template class FixedStreamArray { friend class FixedStreamArrayIterator; public: + typedef FixedStreamArrayIterator Iterator; + FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { assert(Stream.getLength() % sizeof(T) == 0); @@ -371,7 +374,7 @@ public: } FixedStreamArrayIterator &operator-=(std::ptrdiff_t N) { - assert(Index >= N); + assert(std::ptrdiff_t(Index) >= N); Index -= N; return *this; } diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index a56bc93e111b..be9e46540016 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -111,12 +111,6 @@ #define LLVM_PREFETCH(addr, rw, locality) #endif -#if __has_attribute(sentinel) || LLVM_GNUC_PREREQ(3, 0, 0) -#define LLVM_END_WITH_NULL __attribute__((sentinel)) -#else -#define LLVM_END_WITH_NULL -#endif - #if __has_attribute(used) || LLVM_GNUC_PREREQ(3, 1, 0) #define LLVM_ATTRIBUTE_USED __attribute__((__used__)) #else @@ -233,6 +227,8 @@ /// LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements. #if __cplusplus > 201402L && __has_cpp_attribute(fallthrough) #define LLVM_FALLTHROUGH [[fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define LLVM_FALLTHROUGH [[gnu::fallthrough]] #elif !__cplusplus // Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious // error when __has_cpp_attribute is given a scoped attribute in C mode. diff --git a/include/llvm/Support/KnownBits.h b/include/llvm/Support/KnownBits.h index 3d38cf878538..2c77d40559b9 100644 --- a/include/llvm/Support/KnownBits.h +++ b/include/llvm/Support/KnownBits.h @@ -133,6 +133,66 @@ public: KnownBits zextOrTrunc(unsigned BitWidth) { return KnownBits(Zero.zextOrTrunc(BitWidth), One.zextOrTrunc(BitWidth)); } + + /// Returns the minimum number of trailing zero bits. + unsigned countMinTrailingZeros() const { + return Zero.countTrailingOnes(); + } + + /// Returns the minimum number of trailing one bits. + unsigned countMinTrailingOnes() const { + return One.countTrailingOnes(); + } + + /// Returns the minimum number of leading zero bits. + unsigned countMinLeadingZeros() const { + return Zero.countLeadingOnes(); + } + + /// Returns the minimum number of leading one bits. + unsigned countMinLeadingOnes() const { + return One.countLeadingOnes(); + } + + /// Returns the number of times the sign bit is replicated into the other + /// bits. + unsigned countMinSignBits() const { + if (isNonNegative()) + return countMinLeadingZeros(); + if (isNegative()) + return countMinLeadingOnes(); + return 0; + } + + /// Returns the maximum number of trailing zero bits possible. 
+ unsigned countMaxTrailingZeros() const { + return One.countTrailingZeros(); + } + + /// Returns the maximum number of trailing one bits possible. + unsigned countMaxTrailingOnes() const { + return Zero.countTrailingZeros(); + } + + /// Returns the maximum number of leading zero bits possible. + unsigned countMaxLeadingZeros() const { + return One.countLeadingZeros(); + } + + /// Returns the maximum number of leading one bits possible. + unsigned countMaxLeadingOnes() const { + return Zero.countLeadingZeros(); + } + + /// Returns the number of bits known to be one. + unsigned countMinPopulation() const { + return One.countPopulation(); + } + + /// Returns the maximum number of bits that could be one. + unsigned countMaxPopulation() const { + return getBitWidth() - Zero.countPopulation(); + } }; } // end namespace llvm diff --git a/include/llvm/Support/Parallel.h b/include/llvm/Support/Parallel.h new file mode 100644 index 000000000000..e36e0cc29e14 --- /dev/null +++ b/include/llvm/Support/Parallel.h @@ -0,0 +1,249 @@ +//===- llvm/Support/Parallel.h - Parallel algorithms ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_PARALLEL_H +#define LLVM_SUPPORT_PARALLEL_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/MathExtras.h" + +#include +#include +#include +#include + +#if defined(_MSC_VER) && LLVM_ENABLE_THREADS +#pragma warning(push) +#pragma warning(disable : 4530) +#include +#include +#pragma warning(pop) +#endif + +namespace llvm { + +namespace parallel { +struct sequential_execution_policy {}; +struct parallel_execution_policy {}; + +template +struct is_execution_policy + : public std::integral_constant< + bool, llvm::is_one_of::value> {}; + +constexpr sequential_execution_policy seq{}; +constexpr parallel_execution_policy par{}; + +namespace detail { + +#if LLVM_ENABLE_THREADS + +class Latch { + uint32_t Count; + mutable std::mutex Mutex; + mutable std::condition_variable Cond; + +public: + explicit Latch(uint32_t Count = 0) : Count(Count) {} + ~Latch() { sync(); } + + void inc() { + std::unique_lock lock(Mutex); + ++Count; + } + + void dec() { + std::unique_lock lock(Mutex); + if (--Count == 0) + Cond.notify_all(); + } + + void sync() const { + std::unique_lock lock(Mutex); + Cond.wait(lock, [&] { return Count == 0; }); + } +}; + +class TaskGroup { + Latch L; + +public: + void spawn(std::function f); + + void sync() const { L.sync(); } +}; + +#if defined(_MSC_VER) +template +void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp) { + concurrency::parallel_sort(Start, End, Comp); +} +template +void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) { + concurrency::parallel_for_each(Begin, End, Fn); +} + +template +void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) { + concurrency::parallel_for(Begin, End, Fn); +} + +#else +const ptrdiff_t MinParallelSize = 1024; + +/// \brief Inclusive median. +template +RandomAccessIterator medianOf3(RandomAccessIterator Start, + RandomAccessIterator End, + const Comparator &Comp) { + RandomAccessIterator Mid = Start + (std::distance(Start, End) / 2); + return Comp(*Start, *(End - 1)) + ? (Comp(*Mid, *(End - 1)) ? (Comp(*Start, *Mid) ? Mid : Start) + : End - 1) + : (Comp(*Mid, *Start) ? 
(Comp(*(End - 1), *Mid) ? Mid : End - 1) + : Start); +} + +template +void parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp, TaskGroup &TG, size_t Depth) { + // Do a sequential sort for small inputs. + if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) { + std::sort(Start, End, Comp); + return; + } + + // Partition. + auto Pivot = medianOf3(Start, End, Comp); + // Move Pivot to End. + std::swap(*(End - 1), *Pivot); + Pivot = std::partition(Start, End - 1, [&Comp, End](decltype(*Start) V) { + return Comp(V, *(End - 1)); + }); + // Move Pivot to middle of partition. + std::swap(*Pivot, *(End - 1)); + + // Recurse. + TG.spawn([=, &Comp, &TG] { + parallel_quick_sort(Start, Pivot, Comp, TG, Depth - 1); + }); + parallel_quick_sort(Pivot + 1, End, Comp, TG, Depth - 1); +} + +template +void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp) { + TaskGroup TG; + parallel_quick_sort(Start, End, Comp, TG, + llvm::Log2_64(std::distance(Start, End)) + 1); +} + +template +void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) { + // TaskGroup has a relatively high overhead, so we want to reduce + // the number of spawn() calls. We'll create up to 1024 tasks here. + // (Note that 1024 is an arbitrary number. This code probably needs + // improving to take the number of available cores into account.) + ptrdiff_t TaskSize = std::distance(Begin, End) / 1024; + if (TaskSize == 0) + TaskSize = 1; + + TaskGroup TG; + while (TaskSize <= std::distance(Begin, End)) { + TG.spawn([=, &Fn] { std::for_each(Begin, Begin + TaskSize, Fn); }); + Begin += TaskSize; + } + TG.spawn([=, &Fn] { std::for_each(Begin, End, Fn); }); +} + +template +void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) { + ptrdiff_t TaskSize = (End - Begin) / 1024; + if (TaskSize == 0) + TaskSize = 1; + + TaskGroup TG; + IndexTy I = Begin; + for (; I + TaskSize < End; I += TaskSize) { + TG.spawn([=, &Fn] { + for (IndexTy J = I, E = I + TaskSize; J != E; ++J) + Fn(J); + }); + } + TG.spawn([=, &Fn] { + for (IndexTy J = I; J < End; ++J) + Fn(J); + }); +} + +#endif + +#endif + +template +using DefComparator = + std::less::value_type>; + +} // namespace detail + +// sequential algorithm implementations. +template > +void sort(Policy policy, RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp = Comparator()) { + static_assert(is_execution_policy::value, + "Invalid execution policy!"); + std::sort(Start, End, Comp); +} + +template +void for_each(Policy policy, IterTy Begin, IterTy End, FuncTy Fn) { + static_assert(is_execution_policy::value, + "Invalid execution policy!"); + std::for_each(Begin, End, Fn); +} + +template +void for_each_n(Policy policy, IndexTy Begin, IndexTy End, FuncTy Fn) { + static_assert(is_execution_policy::value, + "Invalid execution policy!"); + for (IndexTy I = Begin; I != End; ++I) + Fn(I); +} + +// Parallel algorithm implementations, only available when LLVM_ENABLE_THREADS +// is true. 
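For orientation, a minimal caller of the interface this header defines; seq and par are the policy tags declared above, and the par overloads below only run in parallel when LLVM_ENABLE_THREADS is set (hypothetical usage, not part of the patch):

#include "llvm/Support/Parallel.h"
#include <vector>

void sortAndVisit(std::vector<int> &V) {
  // Parallel quicksort via the policy-tag overload.
  llvm::parallel::sort(llvm::parallel::par, V.begin(), V.end());
  // Sequential traversal; swapping seq for par changes only the tag.
  llvm::parallel::for_each(llvm::parallel::seq, V.begin(), V.end(),
                           [](int X) { (void)X; });
}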
+#if LLVM_ENABLE_THREADS +template > +void sort(parallel_execution_policy policy, RandomAccessIterator Start, + RandomAccessIterator End, const Comparator &Comp = Comparator()) { + detail::parallel_sort(Start, End, Comp); +} + +template +void for_each(parallel_execution_policy policy, IterTy Begin, IterTy End, + FuncTy Fn) { + detail::parallel_for_each(Begin, End, Fn); +} + +template +void for_each_n(parallel_execution_policy policy, IndexTy Begin, IndexTy End, + FuncTy Fn) { + detail::parallel_for_each_n(Begin, End, Fn); +} +#endif + +} // namespace parallel +} // namespace llvm + +#endif // LLVM_SUPPORT_PARALLEL_H diff --git a/include/llvm/Support/Wasm.h b/include/llvm/Support/Wasm.h index a48dfe10b3bb..e3831827062c 100644 --- a/include/llvm/Support/Wasm.h +++ b/include/llvm/Support/Wasm.h @@ -37,17 +37,6 @@ struct WasmSignature { int32_t ReturnType; }; -struct WasmImport { - StringRef Module; - StringRef Field; - uint32_t Kind; - union { - uint32_t SigIndex; - int32_t GlobalType; - }; - bool GlobalMutable; -}; - struct WasmExport { StringRef Name; uint32_t Kind; @@ -82,6 +71,18 @@ struct WasmGlobal { WasmInitExpr InitExpr; }; +struct WasmImport { + StringRef Module; + StringRef Field; + uint32_t Kind; + union { + uint32_t SigIndex; + WasmGlobal Global; + WasmTable Table; + WasmLimits Memory; + }; +}; + struct WasmLocalDecl { int32_t Type; uint32_t Count; diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index fc35b4527bc3..6f44292c47ed 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -680,6 +680,11 @@ class RegisterOperand // this type. The method normally will just use an alt-name index to look // up the name to print. Default to the generic printOperand(). string PrintMethod = pm; + + // EncoderMethod - The target method name to call to encode this register + // operand. + string EncoderMethod = ""; + // ParserMatchClass - The "match class" that operands of this type fit // in. Match classes are used to define the order in which instructions are // match, to ensure that which instructions gets matched is deterministic. diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 82a682cf1f7e..97a6f0c6e3ae 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -172,11 +172,22 @@ public: /// inalloca arguments. This function reports only the size of the frame part /// that is set up between the frame setup and destroy pseudo instructions. int64_t getFrameSize(const MachineInstr &I) const { - assert(isFrameInstr(I)); + assert(isFrameInstr(I) && "Not a frame instruction"); assert(I.getOperand(0).getImm() >= 0); return I.getOperand(0).getImm(); } + /// Returns the total frame size, which is made up of the space set up inside + /// the pair of frame start-stop instructions and the space that is set up + /// prior to the pair. 
+ int64_t getFrameTotalSize(const MachineInstr &I) const { + if (isFrameSetup(I)) { + assert(I.getOperand(1).getImm() >= 0 && "Frame size must not be negative"); + return getFrameSize(I) + I.getOperand(1).getImm(); + } + return getFrameSize(I); + } + unsigned getCatchReturnOpcode() const { return CatchRetOpcode; } unsigned getReturnOpcode() const { return ReturnOpcode; } diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index ced183852146..1ca32d4c3589 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -1396,7 +1396,10 @@ public: /// It is called by AtomicExpandPass before expanding an /// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad /// if shouldInsertFencesForAtomic returns true. - /// RMW and CmpXchg set both IsStore and IsLoad to true. + /// + /// Inst is the original atomic instruction, prior to other expansions that + /// may be performed. + /// /// This function should either return a nullptr, or a pointer to an IR-level /// Instruction*. Even complex fence sequences can be represented by a /// single Instruction* through an intrinsic to be lowered later. @@ -1422,18 +1425,17 @@ public: /// seq_cst. But if they are lowered to monotonic accesses, no amount of /// IR-level fences can prevent it. /// @{ - virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { - if (isReleaseOrStronger(Ord) && IsStore) + virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const { + if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) return Builder.CreateFence(Ord); else return nullptr; } virtual Instruction *emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { + Instruction *Inst, + AtomicOrdering Ord) const { if (isAcquireOrStronger(Ord)) return Builder.CreateFence(Ord); else @@ -2061,14 +2063,6 @@ public: return false; } - // Return true if the instruction that performs a << b actually performs - // a << (b % (sizeof(a) * 8)). - virtual bool supportsModuloShift(ISD::NodeType Inst, EVT ReturnType) const { - assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) && - "Expect a shift instruction"); - return false; - } - //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index d342e4fe2613..7b00c9420e35 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -334,7 +334,7 @@ class ReadAdvance writes = []> } // Directly associate a new SchedRead type with a delay and optional -// pipeline bypess. For use with InstRW or ItinRW. +// pipeline bypass. For use with InstRW or ItinRW. 
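Backing up to the TargetInstrInfo hunk above: a standalone restatement of the new getFrameTotalSize() logic, with the MachineInstr operands reduced to plain integers. The reading of operand 1 as space set up prior to the setup/destroy pair follows the doc comment; everything else here is illustrative.

#include <cassert>
#include <cstdint>

int64_t frameTotalSize(bool IsFrameSetup, int64_t Inside, int64_t Before) {
  // Inside models operand 0: the space between the setup/destroy pair.
  // Before models the new operand 1: space established prior to the pair.
  assert(Inside >= 0);
  if (IsFrameSetup) {
    assert(Before >= 0 && "Frame size must not be negative");
    return Inside + Before;
  }
  return Inside; // frame-destroy instructions report only the inner size
}
// frameTotalSize(true, 32, 16) == 48; frameTotalSize(false, 32, 0) == 32.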
class SchedReadAdvance<int cycles, list<SchedWrite> writes = []> : SchedRead, ProcReadAdvance<cycles, writes>; diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 45a842f77a21..9ed614ccee17 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -281,7 +281,7 @@ def SDTConvertOp : SDTypeProfile<1, 5, [ //cvtss, su, us, uu, ff, fs, fu, sf, su ]>; class SDCallSeqStart<list<SDTypeConstraint> constraints> : - SDTypeProfile<0, 1, constraints>; + SDTypeProfile<0, 2, constraints>; class SDCallSeqEnd<list<SDTypeConstraint> constraints> : SDTypeProfile<0, 2, constraints>; diff --git a/include/llvm/ToolDrivers/llvm-lib/LibDriver.h b/include/llvm/ToolDrivers/llvm-lib/LibDriver.h new file mode 100644 index 000000000000..a4806ac4ad69 --- /dev/null +++ b/include/llvm/ToolDrivers/llvm-lib/LibDriver.h @@ -0,0 +1,24 @@ +//===- llvm-lib/LibDriver.h - lib.exe-compatible driver ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a lib.exe-compatible driver that also understands +// bitcode files. Used by llvm-lib and lld-link /lib. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLDRIVERS_LLVM_LIB_LIBDRIVER_H +#define LLVM_TOOLDRIVERS_LLVM_LIB_LIBDRIVER_H + +namespace llvm { +template <typename T> class ArrayRef; + +int libDriverMain(ArrayRef<const char *> ARgs); +} + +#endif diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h index 0a8903a6ed7b..91c9d255302f 100644 --- a/include/llvm/Transforms/Utils/Cloning.h +++ b/include/llvm/Transforms/Utils/Cloning.h @@ -43,6 +43,7 @@ class InvokeInst; class Loop; class LoopInfo; class Module; +class ProfileSummaryInfo; class ReturnInst; /// Return an exact copy of the specified module @@ -175,15 +176,17 @@ public: explicit InlineFunctionInfo(CallGraph *cg = nullptr, std::function<AssumptionCache &(Function &)> *GetAssumptionCache = nullptr, + ProfileSummaryInfo *PSI = nullptr, BlockFrequencyInfo *CallerBFI = nullptr, BlockFrequencyInfo *CalleeBFI = nullptr) - : CG(cg), GetAssumptionCache(GetAssumptionCache), CallerBFI(CallerBFI), - CalleeBFI(CalleeBFI) {} + : CG(cg), GetAssumptionCache(GetAssumptionCache), PSI(PSI), + CallerBFI(CallerBFI), CalleeBFI(CalleeBFI) {} /// CG - If non-null, InlineFunction will update the callgraph to reflect the /// changes it makes. CallGraph *CG; std::function<AssumptionCache &(Function &)> *GetAssumptionCache; + ProfileSummaryInfo *PSI; BlockFrequencyInfo *CallerBFI, *CalleeBFI; /// StaticAllocas - InlineFunction fills this in with all static allocas that diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index a1cf41d6f931..561f94880624 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -21,6 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" @@ -42,6 +43,7 @@ class PredIteratorCache; class ScalarEvolution; class SCEV; class TargetLibraryInfo; +class TargetTransformInfo; /// \brief Captures loop safety information. /// It keep information for loop & its header may throw exception.
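The next hunk declares getShuffleReduction(), which emits a log2 ladder of shufflevector-plus-operation steps. A scalar model of that ladder (illustration only, using integer addition for the reduction operation and assuming a non-empty, power-of-two lane count):

#include <cstddef>
#include <vector>

int shuffleReduceAdd(std::vector<int> Lanes) {
  // Each step folds the upper half of the lanes into the lower half,
  // halving the active width until lane 0 holds the full reduction.
  for (size_t Half = Lanes.size() / 2; Half >= 1; Half /= 2)
    for (size_t I = 0; I < Half; ++I)
      Lanes[I] += Lanes[I + Half];
  return Lanes[0];
}
// shuffleReduceAdd({1, 2, 3, 4, 5, 6, 7, 8}) == 36 in log2(8) == 3 steps.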
@@ -489,6 +491,36 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE = nullptr); +/// Generates a vector reduction using shufflevectors to reduce the value. +Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, + RecurrenceDescriptor::MinMaxRecurrenceKind + MinMaxKind = RecurrenceDescriptor::MRK_Invalid, + ArrayRef RedOps = ArrayRef()); + +/// Create a target reduction of the given vector. The reduction operation +/// is described by the \p Opcode parameter. min/max reductions require +/// additional information supplied in \p Flags. +/// The target is queried to determine if intrinsics or shuffle sequences are +/// required to implement the reduction. +Value * +createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, + unsigned Opcode, Value *Src, + TargetTransformInfo::ReductionFlags Flags = + TargetTransformInfo::ReductionFlags(), + ArrayRef RedOps = ArrayRef()); + +/// Create a generic target reduction using a recurrence descriptor \p Desc +/// The target is queried to determine if intrinsics or shuffle sequences are +/// required to implement the reduction. +Value *createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, + RecurrenceDescriptor &Desc, Value *Src, + bool NoNaN = false); + +/// Get the intersection (logical and) of all of the potential IR flags +/// of each scalar operation (VL) that will be converted into a vector (I). +/// Flag set: NSW, NUW, exact, and all of fast-math. +void propagateIRFlags(Value *I, ArrayRef VL); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H diff --git a/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 10338f7937e8..c514db41623c 100644 --- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -24,6 +24,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" @@ -59,7 +60,8 @@ public: // Glue for old PM. 
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, - DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_); + DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_); private: /// \brief Collect store and getelementptr instructions and organize them diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index 59b1f1621039..5e15e8d49802 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -148,6 +148,7 @@ module LLVM_intrinsic_gen { module IR_Attributes { header "IR/Attributes.h" export * } module IR_CallSite { header "IR/CallSite.h" export * } module IR_ConstantFolder { header "IR/ConstantFolder.h" export * } + module IR_GlobalVariable { header "IR/GlobalVariable.h" export * } module IR_NoFolder { header "IR/NoFolder.h" export * } module IR_Module { header "IR/Module.h" export * } module IR_ModuleSummaryIndex { header "IR/ModuleSummaryIndex.h" export * } diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 537823020301..a33c01a0e461 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -17,13 +17,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -36,6 +36,7 @@ #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include #define DEBUG_TYPE "basicaa" @@ -1283,9 +1284,9 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // give up if we can't determine conditions that hold for every cycle: const Value *V = DecompGEP1.VarIndices[i].V; - bool SignKnownZero, SignKnownOne; - ComputeSignBit(const_cast(V), SignKnownZero, SignKnownOne, DL, - 0, &AC, nullptr, DT); + KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, DT); + bool SignKnownZero = Known.isNonNegative(); + bool SignKnownOne = Known.isNegative(); // Zero-extension widens the variable, and so forces the sign // bit to zero. diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index 0dc4475ca0e2..db87b17c1567 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -301,6 +301,8 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) { WeightSum += Weights[i]; } } + assert(WeightSum <= UINT32_MAX && + "Expected weights to scale down to 32 bits"); if (WeightSum == 0 || ReachableIdxs.size() == 0) { for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) @@ -328,21 +330,14 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) { // the difference between reachable blocks. if (ToDistribute > BranchProbability::getZero()) { BranchProbability PerEdge = ToDistribute / ReachableIdxs.size(); - for (auto i : ReachableIdxs) { + for (auto i : ReachableIdxs) BP[i] += PerEdge; - ToDistribute -= PerEdge; - } - // Tail goes to the first reachable edge. 
- BP[ReachableIdxs[0]] += ToDistribute; } } for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) setEdgeProbability(BB, i, BP[i]); - assert(WeightSum <= UINT32_MAX && - "Expected weights to scale down to 32 bits"); - return true; } diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp index 6942176ae6ae..ff5242f69a1b 100644 --- a/lib/Analysis/CallGraph.cpp +++ b/lib/Analysis/CallGraph.cpp @@ -21,23 +21,18 @@ using namespace llvm; // CallGraph::CallGraph(Module &M) - : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)), + : M(M), ExternalCallingNode(getOrInsertFunction(nullptr)), CallsExternalNode(llvm::make_unique(nullptr)) { // Add every function to the call graph. for (Function &F : M) addToCallGraph(&F); - - // If we didn't find a main function, use the external call graph node - if (!Root) - Root = ExternalCallingNode; } CallGraph::CallGraph(CallGraph &&Arg) - : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root), + : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), ExternalCallingNode(Arg.ExternalCallingNode), CallsExternalNode(std::move(Arg.CallsExternalNode)) { Arg.FunctionMap.clear(); - Arg.Root = nullptr; Arg.ExternalCallingNode = nullptr; } @@ -57,21 +52,9 @@ CallGraph::~CallGraph() { void CallGraph::addToCallGraph(Function *F) { CallGraphNode *Node = getOrInsertFunction(F); - // If this function has external linkage, anything could call it. - if (!F->hasLocalLinkage()) { - ExternalCallingNode->addCalledFunction(CallSite(), Node); - - // Found the entry point? - if (F->getName() == "main") { - if (Root) // Found multiple external mains? Don't pick one. - Root = ExternalCallingNode; - else - Root = Node; // Found a main, keep track of it! - } - } - - // If this function has its address taken, anything could call it. - if (F->hasAddressTaken()) + // If this function has external linkage or has its address taken, anything + // could call it. + if (!F->hasLocalLinkage() || F->hasAddressTaken()) ExternalCallingNode->addCalledFunction(CallSite(), Node); // If this function is not defined in this translation unit, it could call @@ -96,13 +79,6 @@ void CallGraph::addToCallGraph(Function *F) { } void CallGraph::print(raw_ostream &OS) const { - OS << "CallGraph Root is: "; - if (Function *F = Root->getFunction()) - OS << F->getName() << "\n"; - else { - OS << "<>\n"; - } - // Print in a deterministic order by sorting CallGraphNodes by name. We do // this here to avoid slowing down the non-printing fast path. diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 130e917e49d7..0ca712bbfe70 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -1438,6 +1438,36 @@ bool llvm::canConstantFoldCallTo(const Function *F) { Name == "sinf" || Name == "sinhf" || Name == "sqrtf"; case 't': return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf"; + case '_': + + // Check for various function names that get used for the math functions + // when the header files are preprocessed with the macro + // __FINITE_MATH_ONLY__ enabled. + // The '12' here is the length of the shortest name that can match. + // We need to check the size before looking at Name[1] and Name[2] + // so we may as well check a limit that will eliminate mismatches. 
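A standalone restatement of the screening test that follows (hypothetical helper; only two of the prefix groups are spelled out here, where the real switch also covers acos/asin/atan2, cosh, pow and sinh):

#include <string>

static bool isFiniteMathLibCall(const std::string &Name) {
  // "__exp_finite" and friends are at least 12 characters and start with
  // "__", so this cheap test rejects non-candidates before the switch.
  if (Name.size() < 12 || Name[0] != '_' || Name[1] != '_')
    return false;
  switch (Name[2]) {
  case 'e':
    return Name == "__exp_finite" || Name == "__exp2_finite";
  case 'l':
    return Name == "__log_finite" || Name == "__log10_finite";
  default:
    return false;
  }
}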
+ if (Name.size() < 12 || Name[1] != '_') + return false; + switch (Name[2]) { + default: + return false; + case 'a': + return Name == "__acos_finite" || Name == "__acosf_finite" || + Name == "__asin_finite" || Name == "__asinf_finite" || + Name == "__atan2_finite" || Name == "__atan2f_finite"; + case 'c': + return Name == "__cosh_finite" || Name == "__coshf_finite"; + case 'e': + return Name == "__exp_finite" || Name == "__expf_finite" || + Name == "__exp2_finite" || Name == "__exp2f_finite"; + case 'l': + return Name == "__log_finite" || Name == "__logf_finite" || + Name == "__log10_finite" || Name == "__log10f_finite"; + case 'p': + return Name == "__pow_finite" || Name == "__powf_finite"; + case 's': + return Name == "__sinh_finite" || Name == "__sinhf_finite"; + } } } @@ -1637,13 +1667,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, if (!TLI) return nullptr; - switch (Name[0]) { + char NameKeyChar = Name[0]; + if (Name[0] == '_' && Name.size() > 2 && Name[1] == '_') + NameKeyChar = Name[2]; + + switch (NameKeyChar) { case 'a': if ((Name == "acos" && TLI->has(LibFunc_acos)) || - (Name == "acosf" && TLI->has(LibFunc_acosf))) + (Name == "acosf" && TLI->has(LibFunc_acosf)) || + (Name == "__acos_finite" && TLI->has(LibFunc_acos_finite)) || + (Name == "__acosf_finite" && TLI->has(LibFunc_acosf_finite))) return ConstantFoldFP(acos, V, Ty); else if ((Name == "asin" && TLI->has(LibFunc_asin)) || - (Name == "asinf" && TLI->has(LibFunc_asinf))) + (Name == "asinf" && TLI->has(LibFunc_asinf)) || + (Name == "__asin_finite" && TLI->has(LibFunc_asin_finite)) || + (Name == "__asinf_finite" && TLI->has(LibFunc_asinf_finite))) return ConstantFoldFP(asin, V, Ty); else if ((Name == "atan" && TLI->has(LibFunc_atan)) || (Name == "atanf" && TLI->has(LibFunc_atanf))) @@ -1657,15 +1695,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, (Name == "cosf" && TLI->has(LibFunc_cosf))) return ConstantFoldFP(cos, V, Ty); else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) || - (Name == "coshf" && TLI->has(LibFunc_coshf))) + (Name == "coshf" && TLI->has(LibFunc_coshf)) || + (Name == "__cosh_finite" && TLI->has(LibFunc_cosh_finite)) || + (Name == "__coshf_finite" && TLI->has(LibFunc_coshf_finite))) return ConstantFoldFP(cosh, V, Ty); break; case 'e': if ((Name == "exp" && TLI->has(LibFunc_exp)) || - (Name == "expf" && TLI->has(LibFunc_expf))) + (Name == "expf" && TLI->has(LibFunc_expf)) || + (Name == "__exp_finite" && TLI->has(LibFunc_exp_finite)) || + (Name == "__expf_finite" && TLI->has(LibFunc_expf_finite))) return ConstantFoldFP(exp, V, Ty); if ((Name == "exp2" && TLI->has(LibFunc_exp2)) || - (Name == "exp2f" && TLI->has(LibFunc_exp2f))) + (Name == "exp2f" && TLI->has(LibFunc_exp2f)) || + (Name == "__exp2_finite" && TLI->has(LibFunc_exp2_finite)) || + (Name == "__exp2f_finite" && TLI->has(LibFunc_exp2f_finite))) // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a // C99 library. 
return ConstantFoldBinaryFP(pow, 2.0, V, Ty); @@ -1680,10 +1724,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, break; case 'l': if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) || - (Name == "logf" && V > 0 && TLI->has(LibFunc_logf))) + (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)) || + (Name == "__log_finite" && V > 0 && + TLI->has(LibFunc_log_finite)) || + (Name == "__logf_finite" && V > 0 && + TLI->has(LibFunc_logf_finite))) return ConstantFoldFP(log, V, Ty); else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) || - (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f))) + (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)) || + (Name == "__log10_finite" && V > 0 && + TLI->has(LibFunc_log10_finite)) || + (Name == "__log10f_finite" && V > 0 && + TLI->has(LibFunc_log10f_finite))) return ConstantFoldFP(log10, V, Ty); break; case 'r': @@ -1695,7 +1747,9 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, (Name == "sinf" && TLI->has(LibFunc_sinf))) return ConstantFoldFP(sin, V, Ty); else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) || - (Name == "sinhf" && TLI->has(LibFunc_sinhf))) + (Name == "sinhf" && TLI->has(LibFunc_sinhf)) || + (Name == "__sinh_finite" && TLI->has(LibFunc_sinh_finite)) || + (Name == "__sinhf_finite" && TLI->has(LibFunc_sinhf_finite))) return ConstantFoldFP(sinh, V, Ty); else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) || (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf))) @@ -1813,13 +1867,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, if (!TLI) return nullptr; if ((Name == "pow" && TLI->has(LibFunc_pow)) || - (Name == "powf" && TLI->has(LibFunc_powf))) + (Name == "powf" && TLI->has(LibFunc_powf)) || + (Name == "__pow_finite" && TLI->has(LibFunc_pow_finite)) || + (Name == "__powf_finite" && TLI->has(LibFunc_powf_finite))) return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); if ((Name == "fmod" && TLI->has(LibFunc_fmod)) || (Name == "fmodf" && TLI->has(LibFunc_fmodf))) return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty); if ((Name == "atan2" && TLI->has(LibFunc_atan2)) || - (Name == "atan2f" && TLI->has(LibFunc_atan2f))) + (Name == "atan2f" && TLI->has(LibFunc_atan2f)) || + (Name == "__atan2_finite" && TLI->has(LibFunc_atan2_finite)) || + (Name == "__atan2f_finite" && TLI->has(LibFunc_atan2f_finite))) return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty); } else if (auto *Op2C = dyn_cast(Operands[1])) { if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy()) diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 9f5dc5318239..8f808f3e7871 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -86,13 +86,11 @@ void DemandedBits::determineLiveOperandBits( [&](unsigned BitWidth, const Value *V1, const Value *V2) { const DataLayout &DL = I->getModule()->getDataLayout(); Known = KnownBits(BitWidth); - computeKnownBits(const_cast(V1), Known, DL, 0, - &AC, UserI, &DT); + computeKnownBits(V1, Known, DL, 0, &AC, UserI, &DT); if (V2) { Known2 = KnownBits(BitWidth); - computeKnownBits(const_cast(V2), Known2, DL, - 0, &AC, UserI, &DT); + computeKnownBits(V2, Known2, DL, 0, &AC, UserI, &DT); } }; @@ -118,7 +116,7 @@ void DemandedBits::determineLiveOperandBits( // known to be one. 
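Before the DemandedBits hunk continues: the rewrites below replace Known.One.countLeadingZeros()+1 with the equivalent countMaxLeadingZeros()+1 from the new KnownBits helpers, and likewise for the trailing case. A worked example of those counting identities, with the APInt members reduced to plain 8-bit masks (not LLVM code):

#include <cassert>

int main() {
  // Zero/One mark bits known to be 0/1; all other bits are unknown.
  // Zero = 0xF0, One = 0x06: the value matches the pattern 0000?11?.
  unsigned Zero = 0xF0, One = 0x06;
  assert((Zero & One) == 0 && "known-0 and known-1 masks are disjoint");
  int MinPop = __builtin_popcount(One);      // countMinPopulation() == 2
  int MaxPop = 8 - __builtin_popcount(Zero); // countMaxPopulation() == 4
  int MaxTZ = __builtin_ctz(One);            // countMaxTrailingZeros() == 1
  // countMinLeadingZeros() == Zero.countLeadingOnes() == 4 here: the top
  // four bits are already known to be zero.
  assert(MinPop == 2 && MaxPop == 4 && MaxTZ == 1);
  return 0;
}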
ComputeKnownBits(BitWidth, I, nullptr); AB = APInt::getHighBitsSet(BitWidth, - std::min(BitWidth, Known.One.countLeadingZeros()+1)); + std::min(BitWidth, Known.countMaxLeadingZeros()+1)); } break; case Intrinsic::cttz: @@ -128,7 +126,7 @@ void DemandedBits::determineLiveOperandBits( // known to be one. ComputeKnownBits(BitWidth, I, nullptr); AB = APInt::getLowBitsSet(BitWidth, - std::min(BitWidth, Known.One.countTrailingZeros()+1)); + std::min(BitWidth, Known.countMaxTrailingZeros()+1)); } break; } diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 100a591e452c..44c14cb17c22 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -63,7 +63,7 @@ static cl::opt // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. static cl::opt ColdThreshold( - "inlinecold-threshold", cl::Hidden, cl::init(225), + "inlinecold-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining functions with cold attribute")); static cl::opt diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 4a713f441ce8..5728887cc1e9 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -1317,7 +1317,7 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, // If all valid bits in the shift amount are known zero, the first operand is // unchanged. unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth); - if (Known.Zero.countTrailingOnes() >= NumValidShiftBits) + if (Known.countMinTrailingZeros() >= NumValidShiftBits) return Op0; return nullptr; @@ -1536,7 +1536,7 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1, auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0); auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1); - // For and-of-comapares, check if the intersection is empty: + // For and-of-compares, check if the intersection is empty: // (icmp X, C0) && (icmp X, C1) --> empty set --> false if (IsAnd && Range0.intersectWith(Range1).isEmptySet()) return getFalse(Cmp0->getType()); @@ -1870,6 +1870,24 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; + // (A & B) | (~A ^ B) -> (~A ^ B) + // (B & A) | (~A ^ B) -> (~A ^ B) + // (A & B) | (B ^ ~A) -> (B ^ ~A) + // (B & A) | (B ^ ~A) -> (B ^ ~A) + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || + match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) + return Op1; + + // (~A ^ B) | (A & B) -> (~A ^ B) + // (~A ^ B) | (B & A) -> (~A ^ B) + // (B ^ ~A) | (A & B) -> (B ^ ~A) + // (B ^ ~A) | (B & A) -> (B ^ ~A) + if (match(Op1, m_And(m_Value(A), m_Value(B))) && + (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || + match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) + return Op0; + if (Value *V = simplifyAndOrOfICmps(Op0, Op1, false)) return V; @@ -2286,7 +2304,6 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, return nullptr; Type *ITy = GetCompareTy(LHS); // The return type. 
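The SimplifyOrInst addition above folds (A & B) | (~A ^ B) to (~A ^ B). The identity is bitwise, so an exhaustive i1 check proves it for every width; a throwaway verification sketch:

#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B) {
      int NotA = A ^ 1;
      // (A & B) | (~A ^ B) == (~A ^ B) on all four 1-bit inputs.
      assert(((A & B) | (NotA ^ B)) == (NotA ^ B));
    }
  return 0;
}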
- bool LHSKnownNonNegative, LHSKnownNegative; switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); @@ -2304,39 +2321,41 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; - case ICmpInst::ICMP_SLT: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + case ICmpInst::ICMP_SLT: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getTrue(ITy); - if (LHSKnownNonNegative) + if (LHSKnown.isNonNegative()) return getFalse(ITy); break; - case ICmpInst::ICMP_SLE: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + } + case ICmpInst::ICMP_SLE: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getTrue(ITy); - if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (LHSKnown.isNonNegative() && + isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getFalse(ITy); break; - case ICmpInst::ICMP_SGE: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + } + case ICmpInst::ICMP_SGE: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getFalse(ITy); - if (LHSKnownNonNegative) + if (LHSKnown.isNonNegative()) return getTrue(ITy); break; - case ICmpInst::ICMP_SGT: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + } + case ICmpInst::ICMP_SGT: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getFalse(ITy); - if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (LHSKnown.isNonNegative() && + isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; } + } return nullptr; } @@ -2535,6 +2554,9 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, return nullptr; } +/// TODO: A large part of this logic is duplicated in InstCombine's +/// foldICmpBinOp(). We should be able to share that and avoid the code +/// duplication. static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { @@ -2616,15 +2638,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, return getTrue(ITy); if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) { - bool RHSKnownNonNegative, RHSKnownNegative; - bool YKnownNonNegative, YKnownNegative; - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, Q.DL, 0, - Q.AC, Q.CxtI, Q.DT); - ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (RHSKnownNonNegative && YKnownNegative) + KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (RHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy); - if (RHSKnownNegative || YKnownNonNegative) + if (RHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SLT ? 
getFalse(ITy) : getTrue(ITy); } } @@ -2636,15 +2654,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, return getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) { - bool LHSKnownNonNegative, LHSKnownNegative; - bool YKnownNonNegative, YKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, - Q.AC, Q.CxtI, Q.DT); - ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNonNegative && YKnownNegative) + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy); - if (LHSKnownNegative || YKnownNonNegative) + if (LHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy); } } @@ -2691,28 +2705,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, // icmp pred (urem X, Y), Y if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { - bool KnownNonNegative, KnownNegative; switch (Pred) { default: break; case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: - ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SGE: { + KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getFalse(ITy); case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: - ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SLE: { + KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: @@ -2722,28 +2735,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, // icmp pred X, (urem Y, X) if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) { - bool KnownNonNegative, KnownNegative; switch (Pred) { default: break; case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: - ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SGE: { + KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: - ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SLE: { + KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: @@ -2815,10 +2827,19 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, break; case Instruction::UDiv: case Instruction::LShr: - if (ICmpInst::isSigned(Pred)) + if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact()) break; - LLVM_FALLTHROUGH; + if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + RBO->getOperand(0), Q, MaxRecurse - 1)) + return V; + break; case Instruction::SDiv: + if 
(!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact()) + break; + if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + RBO->getOperand(0), Q, MaxRecurse - 1)) + return V; + break; case Instruction::AShr: if (!LBO->isExact() || !RBO->isExact()) break; @@ -4034,24 +4055,21 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, /// match a root vector source operand that contains that element in the same /// vector lane (ie, the same mask index), so we can eliminate the shuffle(s). static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, - Constant *Mask, Value *RootVec, int RootElt, + int MaskVal, Value *RootVec, unsigned MaxRecurse) { if (!MaxRecurse--) return nullptr; // Bail out if any mask value is undefined. That kind of shuffle may be // simplified further based on demanded bits or other folds. - int MaskVal = ShuffleVectorInst::getMaskValue(Mask, RootElt); if (MaskVal == -1) return nullptr; // The mask value chooses which source operand we need to look at next. - Value *SourceOp; int InVecNumElts = Op0->getType()->getVectorNumElements(); - if (MaskVal < InVecNumElts) { - RootElt = MaskVal; - SourceOp = Op0; - } else { + int RootElt = MaskVal; + Value *SourceOp = Op0; + if (MaskVal >= InVecNumElts) { RootElt = MaskVal - InVecNumElts; SourceOp = Op1; } @@ -4061,7 +4079,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, if (auto *SourceShuf = dyn_cast(SourceOp)) { return foldIdentityShuffles( DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1), - SourceShuf->getMask(), RootVec, RootElt, MaxRecurse); + SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse); } // TODO: Look through bitcasts? What if the bitcast changes the vector element @@ -4126,17 +4144,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, // second one. if (Op0Const && !Op1Const) { std::swap(Op0, Op1); - for (int &Idx : Indices) { - if (Idx == -1) - continue; - Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts; - assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 && - "shufflevector mask index out of range"); - } - Mask = ConstantDataVector::get( - Mask->getContext(), - makeArrayRef(reinterpret_cast(Indices.data()), - MaskNumElts)); + ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts); } // A shuffle of a splat is always the splat itself. Legal if the shuffle's @@ -4160,7 +4168,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, for (unsigned i = 0; i != MaskNumElts; ++i) { // Note that recursion is limited for each vector element, so if any element // exceeds the limit, this will fail to simplify. - RootVec = foldIdentityShuffles(i, Op0, Op1, Mask, RootVec, i, MaxRecurse); + RootVec = + foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse); // We can't replace a widening/narrowing shuffle with one of its operands. if (!RootVec || RootVec->getType() != RetTy) diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 99f900ae3932..26706f5509ba 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -232,7 +232,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, } // We should have named any anonymous globals assert(CalledFunction->hasName()); - auto ScaledCount = ProfileSummaryInfo::getProfileCount(&I, BFI); + auto ScaledCount = PSI->getProfileCount(&I, BFI); auto Hotness = ScaledCount ? 
getHotness(ScaledCount.getValue(), PSI) : CalleeInfo::HotnessType::Unknown; @@ -330,6 +330,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( const Module &M, std::function GetBFICallback, ProfileSummaryInfo *PSI) { + assert(PSI); ModuleSummaryIndex Index; // Identify the local values in the llvm.used and llvm.compiler.used sets, diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp index 73245981b022..e38e530c052d 100644 --- a/lib/Analysis/OptimizationDiagnosticInfo.cpp +++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp @@ -101,7 +101,7 @@ void MappingTraits::mapping( // These are read-only for now. DiagnosticLocation DL = OptDiag->getLocation(); StringRef FN = - GlobalValue::getRealLinkageName(OptDiag->getFunction().getName()); + GlobalValue::dropLLVMManglingEscape(OptDiag->getFunction().getName()); StringRef PassName(OptDiag->PassName); io.mapRequired("Pass", PassName); diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp index 1a53a8ed4283..502f4205b689 100644 --- a/lib/Analysis/ProfileSummaryInfo.cpp +++ b/lib/Analysis/ProfileSummaryInfo.cpp @@ -75,11 +75,14 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst, return None; assert((isa(Inst) || isa(Inst)) && "We can only get profile count for call/invoke instruction."); - // Check if there is a profile metadata on the instruction. If it is present, - // determine hotness solely based on that. - uint64_t TotalCount; - if (Inst->extractProfTotalWeight(TotalCount)) - return TotalCount; + if (computeSummary() && Summary->getKind() == ProfileSummary::PSK_Sample) { + // In sample PGO mode, check if there is a profile metadata on the + // instruction. If it is present, determine hotness solely based on that, + // since the sampled entry count may not be accurate. + uint64_t TotalCount; + if (Inst->extractProfTotalWeight(TotalCount)) + return TotalCount; + } if (BFI) return BFI->getBlockProfileCount(Inst->getParent()); return None; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 01dca0793145..800354d2f5b4 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -584,7 +584,7 @@ CompareValueComplexity(SmallSet, 8> &EqCache, static int CompareSCEVComplexity( SmallSet, 8> &EqCacheSCEV, const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS, - unsigned Depth = 0) { + DominatorTree &DT, unsigned Depth = 0) { // Fast-path: SCEVs are uniqued so we can do a quick equality check. if (LHS == RHS) return 0; @@ -629,9 +629,16 @@ static int CompareSCEVComplexity( const SCEVAddRecExpr *LA = cast(LHS); const SCEVAddRecExpr *RA = cast(RHS); - // Compare addrec loop depths. + // If there is a dominance relationship between the loops, sort by the + // dominance. Otherwise, sort by depth. We require such order in getAddExpr. const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); if (LLoop != RLoop) { + const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader(); + assert(LHead != RHead && "Two loops share the same header?"); + if (DT.dominates(LHead, RHead)) + return 1; + else if (DT.dominates(RHead, LHead)) + return -1; unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth(); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; @@ -645,7 +652,7 @@ static int CompareSCEVComplexity( // Lexicographically compare. 
for (unsigned i = 0; i != LNumOps; ++i) { int X = CompareSCEVComplexity(EqCacheSCEV, LI, LA->getOperand(i), - RA->getOperand(i), Depth + 1); + RA->getOperand(i), DT, Depth + 1); if (X != 0) return X; } @@ -669,7 +676,7 @@ static int CompareSCEVComplexity( if (i >= RNumOps) return 1; int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(i), - RC->getOperand(i), Depth + 1); + RC->getOperand(i), DT, Depth + 1); if (X != 0) return X; } @@ -683,10 +690,10 @@ static int CompareSCEVComplexity( // Lexicographically compare udiv expressions. int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getLHS(), RC->getLHS(), - Depth + 1); + DT, Depth + 1); if (X != 0) return X; - X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), + X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), DT, Depth + 1); if (X == 0) EqCacheSCEV.insert({LHS, RHS}); @@ -701,7 +708,7 @@ static int CompareSCEVComplexity( // Compare cast expressions by operand. int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(), - RC->getOperand(), Depth + 1); + RC->getOperand(), DT, Depth + 1); if (X == 0) EqCacheSCEV.insert({LHS, RHS}); return X; @@ -724,7 +731,7 @@ static int CompareSCEVComplexity( /// land in memory. /// static void GroupByComplexity(SmallVectorImpl &Ops, - LoopInfo *LI) { + LoopInfo *LI, DominatorTree &DT) { if (Ops.size() < 2) return; // Noop SmallSet, 8> EqCache; @@ -732,15 +739,16 @@ static void GroupByComplexity(SmallVectorImpl &Ops, // This is the common case, which also happens to be trivially simple. // Special case it. const SCEV *&LHS = Ops[0], *&RHS = Ops[1]; - if (CompareSCEVComplexity(EqCache, LI, RHS, LHS) < 0) + if (CompareSCEVComplexity(EqCache, LI, RHS, LHS, DT) < 0) std::swap(LHS, RHS); return; } // Do the rough sort by complexity. std::stable_sort(Ops.begin(), Ops.end(), - [&EqCache, LI](const SCEV *LHS, const SCEV *RHS) { - return CompareSCEVComplexity(EqCache, LI, LHS, RHS) < 0; + [&EqCache, LI, &DT](const SCEV *LHS, const SCEV *RHS) { + return + CompareSCEVComplexity(EqCache, LI, LHS, RHS, DT) < 0; }); // Now that we are sorted by complexity, group elements of the same @@ -2186,7 +2194,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags); @@ -2492,7 +2500,13 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, // added together. If so, we can fold them. for (unsigned OtherIdx = Idx+1; OtherIdx < Ops.size() && isa(Ops[OtherIdx]); - ++OtherIdx) + ++OtherIdx) { + // We expect the AddRecExpr's to be sorted in reverse dominance order, + // so that the 1st found AddRecExpr is dominated by all others. + assert(DT.dominates( + cast(Ops[OtherIdx])->getLoop()->getHeader(), + AddRec->getLoop()->getHeader()) && + "AddRecExprs are not sorted in reverse dominance order?"); if (AddRecLoop == cast(Ops[OtherIdx])->getLoop()) { // Other + {A,+,B} + {C,+,D} --> Other + {A+C,+,B+D} SmallVector AddRecOps(AddRec->op_begin(), @@ -2518,6 +2532,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap); return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } + } // Otherwise couldn't fold anything into this recurrence. Move onto the // next one. 
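The assert above depends on the ordering rule this patch adds to CompareSCEVComplexity. Isolated, the new addrec-loop comparison looks roughly like this (an illustrative extract, not the full comparator):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"

int compareAddRecLoops(const llvm::Loop *L, const llvm::Loop *R,
                       const llvm::DominatorTree &DT) {
  const llvm::BasicBlock *LH = L->getHeader(), *RH = R->getHeader();
  if (DT.dominates(LH, RH))
    return 1;  // L sorts after R, so dominated addrecs come first
  if (DT.dominates(RH, LH))
    return -1;
  // Loop depth is only the tie-breaker when neither header dominates.
  return (int)L->getLoopDepth() - (int)R->getLoopDepth();
}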
@@ -2614,7 +2629,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags); @@ -3211,7 +3226,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); // If there are any constants, fold them together. unsigned Idx = 0; @@ -3312,7 +3327,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); // If there are any constants, fold them together. unsigned Idx = 0; @@ -4636,7 +4651,7 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) { KnownBits Known(BitWidth); computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC, nullptr, &DT); - return Known.Zero.countTrailingOnes(); + return Known.countMinTrailingZeros(); } // SCEVUDivExpr @@ -5955,6 +5970,30 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S, return false; } +ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E) + : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {} + +ScalarEvolution::ExitLimit::ExitLimit( + const SCEV *E, const SCEV *M, bool MaxOrZero, + ArrayRef *> PredSetList) + : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) { + assert((isa(ExactNotTaken) || + !isa(MaxNotTaken)) && + "Exact is not allowed to be less precise than Max"); + for (auto *PredSet : PredSetList) + for (auto *P : *PredSet) + addPredicate(P); +} + +ScalarEvolution::ExitLimit::ExitLimit( + const SCEV *E, const SCEV *M, bool MaxOrZero, + const SmallPtrSetImpl &PredSet) + : ExitLimit(E, M, MaxOrZero, {&PredSet}) {} + +ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M, + bool MaxOrZero) + : ExitLimit(E, M, MaxOrZero, None) {} + /// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each /// computable exit into a persistent ExitNotTakenInfo array. ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( @@ -6637,13 +6676,12 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit( // {K,ashr,} stabilizes to signum(K) in at most // bitwidth(K) iterations. Value *FirstValue = PN->getIncomingValueForBlock(Predecessor); - bool KnownZero, KnownOne; - ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr, - Predecessor->getTerminator(), &DT); + KnownBits Known = computeKnownBits(FirstValue, DL, 0, nullptr, + Predecessor->getTerminator(), &DT); auto *Ty = cast(RHS->getType()); - if (KnownZero) + if (Known.isNonNegative()) StableValue = ConstantInt::get(Ty, 0); - else if (KnownOne) + else if (Known.isNegative()) StableValue = ConstantInt::get(Ty, -1, true); else return getCouldNotCompute(); @@ -7377,48 +7415,49 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { const APInt &N = NC->getAPInt(); APInt Two(BitWidth, 2); - { - using namespace APIntOps; - const APInt& C = L; - // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C - // The B coefficient is M-N/2 - APInt B(M); - B -= N.sdiv(Two); + // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C - // The A coefficient is N/2 - APInt A(N.sdiv(Two)); + // The A coefficient is N/2 + APInt A = N.sdiv(Two); - // Compute the B^2-4ac term. 
- APInt SqrtTerm(B); - SqrtTerm *= B; - SqrtTerm -= 4 * (A * C); + // The B coefficient is M-N/2 + APInt B = M; + B -= A; // A is the same as N/2. - if (SqrtTerm.isNegative()) { - // The loop is provably infinite. - return None; - } + // The C coefficient is L. + const APInt& C = L; - // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest - // integer value or else APInt::sqrt() will assert. - APInt SqrtVal(SqrtTerm.sqrt()); + // Compute the B^2-4ac term. + APInt SqrtTerm = B; + SqrtTerm *= B; + SqrtTerm -= 4 * (A * C); - // Compute the two solutions for the quadratic formula. - // The divisions must be performed as signed divisions. - APInt NegB(-B); - APInt TwoA(A << 1); - if (TwoA.isMinValue()) - return None; + if (SqrtTerm.isNegative()) { + // The loop is provably infinite. + return None; + } + + // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest + // integer value or else APInt::sqrt() will assert. + APInt SqrtVal = SqrtTerm.sqrt(); + + // Compute the two solutions for the quadratic formula. + // The divisions must be performed as signed divisions. + APInt NegB = -std::move(B); + APInt TwoA = std::move(A); + TwoA <<= 1; + if (TwoA.isNullValue()) + return None; - LLVMContext &Context = SE.getContext(); + LLVMContext &Context = SE.getContext(); - ConstantInt *Solution1 = - ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA)); - ConstantInt *Solution2 = - ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA)); + ConstantInt *Solution1 = + ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA)); + ConstantInt *Solution2 = + ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA)); - return std::make_pair(cast(SE.getConstant(Solution1)), - cast(SE.getConstant(Solution2))); - } // end APIntOps namespace + return std::make_pair(cast(SE.getConstant(Solution1)), + cast(SE.getConstant(Solution2))); } ScalarEvolution::ExitLimit @@ -8976,7 +9015,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, .getSignedMax(); // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow! - return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).slt(MaxRHS); + return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS); } APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax(); @@ -8985,7 +9024,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, .getUnsignedMax(); // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow! - return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).ult(MaxRHS); + return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS); } bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, @@ -9002,7 +9041,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, .getSignedMax(); // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow! - return (std::move(MinValue) + std::move(MaxStrideMinusOne)).sgt(MinRHS); + return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS); } APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin(); @@ -9011,7 +9050,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, .getUnsignedMax(); // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow! 
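For reference, the coefficient mapping used by the SolveQuadraticEquation cleanup above comes from evaluating the chrec {L,+,M,+,N} at iteration x:

\[
f(x) = L + M\binom{x}{1} + N\binom{x}{2}
     = \frac{N}{2}x^2 + \Bigl(M - \frac{N}{2}\Bigr)x + L,
\]

so A = N/2, B = M - N/2 and C = L exactly as the code comments state, and the candidate trip counts are the integer roots x = (-B +/- sqrt(B^2 - 4AC)) / 2A, computed with signed APInt division.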
-  return (std::move(MinValue) + std::move(MaxStrideMinusOne)).ugt(MinRHS);
+  return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
 }
 
 const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 848e1b4717b5..3cf1bbc5daa5 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -241,6 +241,50 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_tanhf);
   }
 
+  // These definitions are due to the math-finite.h header on Linux.
+  TLI.setUnavailable(LibFunc_acos_finite);
+  TLI.setUnavailable(LibFunc_acosf_finite);
+  TLI.setUnavailable(LibFunc_acosl_finite);
+  TLI.setUnavailable(LibFunc_acosh_finite);
+  TLI.setUnavailable(LibFunc_acoshf_finite);
+  TLI.setUnavailable(LibFunc_acoshl_finite);
+  TLI.setUnavailable(LibFunc_asin_finite);
+  TLI.setUnavailable(LibFunc_asinf_finite);
+  TLI.setUnavailable(LibFunc_asinl_finite);
+  TLI.setUnavailable(LibFunc_atan2_finite);
+  TLI.setUnavailable(LibFunc_atan2f_finite);
+  TLI.setUnavailable(LibFunc_atan2l_finite);
+  TLI.setUnavailable(LibFunc_atanh_finite);
+  TLI.setUnavailable(LibFunc_atanhf_finite);
+  TLI.setUnavailable(LibFunc_atanhl_finite);
+  TLI.setUnavailable(LibFunc_cosh_finite);
+  TLI.setUnavailable(LibFunc_coshf_finite);
+  TLI.setUnavailable(LibFunc_coshl_finite);
+  TLI.setUnavailable(LibFunc_exp10_finite);
+  TLI.setUnavailable(LibFunc_exp10f_finite);
+  TLI.setUnavailable(LibFunc_exp10l_finite);
+  TLI.setUnavailable(LibFunc_exp2_finite);
+  TLI.setUnavailable(LibFunc_exp2f_finite);
+  TLI.setUnavailable(LibFunc_exp2l_finite);
+  TLI.setUnavailable(LibFunc_exp_finite);
+  TLI.setUnavailable(LibFunc_expf_finite);
+  TLI.setUnavailable(LibFunc_expl_finite);
+  TLI.setUnavailable(LibFunc_log10_finite);
+  TLI.setUnavailable(LibFunc_log10f_finite);
+  TLI.setUnavailable(LibFunc_log10l_finite);
+  TLI.setUnavailable(LibFunc_log2_finite);
+  TLI.setUnavailable(LibFunc_log2f_finite);
+  TLI.setUnavailable(LibFunc_log2l_finite);
+  TLI.setUnavailable(LibFunc_log_finite);
+  TLI.setUnavailable(LibFunc_logf_finite);
+  TLI.setUnavailable(LibFunc_logl_finite);
+  TLI.setUnavailable(LibFunc_pow_finite);
+  TLI.setUnavailable(LibFunc_powf_finite);
+  TLI.setUnavailable(LibFunc_powl_finite);
+  TLI.setUnavailable(LibFunc_sinh_finite);
+  TLI.setUnavailable(LibFunc_sinhf_finite);
+  TLI.setUnavailable(LibFunc_sinhl_finite);
+
   // Win32 does *not* provide these functions, but they are
   // generally available on POSIX-compliant systems:
   TLI.setUnavailable(LibFunc_access);
@@ -496,7 +540,7 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
   // Check for \01 prefix that is used to mangle __asm declarations and
   // strip it if present.
- return GlobalValue::getRealLinkageName(funcName); + return GlobalValue::dropLLVMManglingEscape(funcName); } bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName, @@ -1004,22 +1048,34 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, return (NumParams == 1 && FTy.getParamType(0)->isFloatingPointTy()); case LibFunc_acos: + case LibFunc_acos_finite: case LibFunc_acosf: + case LibFunc_acosf_finite: case LibFunc_acosh: + case LibFunc_acosh_finite: case LibFunc_acoshf: + case LibFunc_acoshf_finite: case LibFunc_acoshl: + case LibFunc_acoshl_finite: case LibFunc_acosl: + case LibFunc_acosl_finite: case LibFunc_asin: + case LibFunc_asin_finite: case LibFunc_asinf: + case LibFunc_asinf_finite: case LibFunc_asinh: case LibFunc_asinhf: case LibFunc_asinhl: case LibFunc_asinl: + case LibFunc_asinl_finite: case LibFunc_atan: case LibFunc_atanf: case LibFunc_atanh: + case LibFunc_atanh_finite: case LibFunc_atanhf: + case LibFunc_atanhf_finite: case LibFunc_atanhl: + case LibFunc_atanhl_finite: case LibFunc_atanl: case LibFunc_cbrt: case LibFunc_cbrtf: @@ -1030,18 +1086,30 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosh: + case LibFunc_cosh_finite: case LibFunc_coshf: + case LibFunc_coshf_finite: case LibFunc_coshl: + case LibFunc_coshl_finite: case LibFunc_cosl: case LibFunc_exp10: + case LibFunc_exp10_finite: case LibFunc_exp10f: + case LibFunc_exp10f_finite: case LibFunc_exp10l: + case LibFunc_exp10l_finite: case LibFunc_exp2: + case LibFunc_exp2_finite: case LibFunc_exp2f: + case LibFunc_exp2f_finite: case LibFunc_exp2l: + case LibFunc_exp2l_finite: case LibFunc_exp: + case LibFunc_exp_finite: case LibFunc_expf: + case LibFunc_expf_finite: case LibFunc_expl: + case LibFunc_expl_finite: case LibFunc_expm1: case LibFunc_expm1f: case LibFunc_expm1l: @@ -1052,20 +1120,29 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_floorf: case LibFunc_floorl: case LibFunc_log10: + case LibFunc_log10_finite: case LibFunc_log10f: + case LibFunc_log10f_finite: case LibFunc_log10l: + case LibFunc_log10l_finite: case LibFunc_log1p: case LibFunc_log1pf: case LibFunc_log1pl: case LibFunc_log2: + case LibFunc_log2_finite: case LibFunc_log2f: + case LibFunc_log2f_finite: case LibFunc_log2l: + case LibFunc_log2l_finite: case LibFunc_log: + case LibFunc_log_finite: case LibFunc_logb: case LibFunc_logbf: case LibFunc_logbl: case LibFunc_logf: + case LibFunc_logf_finite: case LibFunc_logl: + case LibFunc_logl_finite: case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_nearbyintl: @@ -1078,8 +1155,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinh: + case LibFunc_sinh_finite: case LibFunc_sinhf: + case LibFunc_sinhf_finite: case LibFunc_sinhl: + case LibFunc_sinhl_finite: case LibFunc_sinl: case LibFunc_sqrt: case LibFunc_sqrt_finite: @@ -1100,8 +1180,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, FTy.getReturnType() == FTy.getParamType(0)); case LibFunc_atan2: + case LibFunc_atan2_finite: case LibFunc_atan2f: + case LibFunc_atan2f_finite: case LibFunc_atan2l: + case LibFunc_atan2l_finite: case LibFunc_fmin: case LibFunc_fminf: case LibFunc_fminl: @@ -1115,8 +1198,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_copysignf: case LibFunc_copysignl: case LibFunc_pow: + case LibFunc_pow_finite: 
case LibFunc_powf: + case LibFunc_powf_finite: case LibFunc_powl: + case LibFunc_powl_finite: return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() && FTy.getReturnType() == FTy.getParamType(0) && FTy.getReturnType() == FTy.getParamType(1)); @@ -1294,6 +1380,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( {"powf", "__svml_powf8", 8}, {"powf", "__svml_powf16", 16}, + { "__pow_finite", "__svml_pow2", 2 }, + { "__pow_finite", "__svml_pow4", 4 }, + { "__pow_finite", "__svml_pow8", 8 }, + + { "__powf_finite", "__svml_powf4", 4 }, + { "__powf_finite", "__svml_powf8", 8 }, + { "__powf_finite", "__svml_powf16", 16 }, + {"llvm.pow.f64", "__svml_pow2", 2}, {"llvm.pow.f64", "__svml_pow4", 4}, {"llvm.pow.f64", "__svml_pow8", 8}, @@ -1310,6 +1404,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( {"expf", "__svml_expf8", 8}, {"expf", "__svml_expf16", 16}, + { "__exp_finite", "__svml_exp2", 2 }, + { "__exp_finite", "__svml_exp4", 4 }, + { "__exp_finite", "__svml_exp8", 8 }, + + { "__expf_finite", "__svml_expf4", 4 }, + { "__expf_finite", "__svml_expf8", 8 }, + { "__expf_finite", "__svml_expf16", 16 }, + {"llvm.exp.f64", "__svml_exp2", 2}, {"llvm.exp.f64", "__svml_exp4", 4}, {"llvm.exp.f64", "__svml_exp8", 8}, @@ -1326,6 +1428,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( {"logf", "__svml_logf8", 8}, {"logf", "__svml_logf16", 16}, + { "__log_finite", "__svml_log2", 2 }, + { "__log_finite", "__svml_log4", 4 }, + { "__log_finite", "__svml_log8", 8 }, + + { "__logf_finite", "__svml_logf4", 4 }, + { "__logf_finite", "__svml_logf8", 8 }, + { "__logf_finite", "__svml_logf16", 16 }, + {"llvm.log.f64", "__svml_log2", 2}, {"llvm.log.f64", "__svml_log4", 4}, {"llvm.log.f64", "__svml_log8", 8}, diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 26d606cce9bb..8a5d10473662 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -279,6 +279,10 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { return TTIImpl->getRegisterBitWidth(Vector); } +unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const { + return TTIImpl->getMinVectorRegisterBitWidth(); +} + bool TargetTransformInfo::shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { return TTIImpl->shouldConsiderAddressTypePromotion( @@ -500,6 +504,15 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF, return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, + Type *Ty, ReductionFlags Flags) const { + return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); +} + +bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const { + return TTIImpl->shouldExpandReduction(II); +} + TargetTransformInfo::Concept::~Concept() {} TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index a7f3ff672aef..cba7363a0afa 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -88,9 +88,8 @@ struct Query { /// classic case of this is assume(x = y), which will attempt to determine /// bits in x from bits in y, which will attempt to determine bits in y from /// bits in x, etc. 
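// Aside: a hypothetical end-to-end illustration (not part of this patch) of
// why TargetLibraryInfo learns the __*_finite names and their SVML vector
// variants above. With glibc's math-finite.h in effect (e.g. under
// -ffast-math), a call to expf() is redirected to __expf_finite; the new
// vectorizable-function entries then let a loop like this be widened into
// calls such as __svml_expf8 when built with something like:
//   clang -O2 -ffast-math -fveclib=SVML example.cpp
#include <math.h>

void expAll(float *Out, const float *In, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = expf(In[I]); // may become __expf_finite, then __svml_expf8
}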
Regarding the mutual recursion, computeKnownBits can call - /// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and - /// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so - /// on. + /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo + /// (all of which can call computeKnownBits), and so on. std::array Excluded; unsigned NumExcluded; @@ -143,6 +142,16 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known, Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } +static KnownBits computeKnownBits(const Value *V, unsigned Depth, + const Query &Q); + +KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL, + unsigned Depth, AssumptionCache *AC, + const Instruction *CxtI, + const DominatorTree *DT) { + return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); +} + bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, @@ -159,16 +168,6 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue(); } -static void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - unsigned Depth, const Query &Q); - -void llvm::ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - const DataLayout &DL, unsigned Depth, - AssumptionCache *AC, const Instruction *CxtI, - const DominatorTree *DT) { - ::ComputeSignBit(V, KnownZero, KnownOne, Depth, - Query(DL, AC, safeCxtI(V, CxtI), DT)); -} static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q); @@ -194,9 +193,8 @@ bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { - bool NonNegative, Negative; - ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT); - return NonNegative; + KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT); + return Known.isNonNegative(); } bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth, @@ -214,9 +212,8 @@ bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth, bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { - bool NonNegative, Negative; - ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT); - return Negative; + KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT); + return Known.isNegative(); } static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q); @@ -342,10 +339,10 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. 
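// Aside: the shape of the new value-returning computeKnownBits() overload
// introduced above, sketched with stand-in types (MiniKnownBits and
// computeBits are invented for illustration). Returning the struct lets
// callers such as isKnownNonNegative() query sign facts directly instead of
// threading a pair of bool out-parameters through every caller.
#include <cassert>
#include <cstdint>

struct MiniKnownBits {
  uint64_t Zero = 0, One = 0;          // known-zero / known-one masks
  bool isNonNegative() const { return (Zero >> 63) & 1; } // sign bit known 0
  bool isNegative() const { return (One >> 63) & 1; }     // sign bit known 1
};

static MiniKnownBits computeBits(int64_t V) {
  uint64_t U = static_cast<uint64_t>(V);
  return {~U, U}; // a constant: every bit is known exactly
}

static bool isKnownNonNegative(int64_t V) {
  return computeBits(V).isNonNegative(); // mirrors the rewritten wrapper
}

int main() {
  assert(isKnownNonNegative(42));
  assert(computeBits(-1).isNegative());
}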
- unsigned TrailZ = Known.Zero.countTrailingOnes() + - Known2.Zero.countTrailingOnes(); - unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() + - Known2.Zero.countLeadingOnes(), + unsigned TrailZ = Known.countMinTrailingZeros() + + Known2.countMinTrailingZeros(); + unsigned LeadZ = std::max(Known.countMinLeadingZeros() + + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; TrailZ = std::min(TrailZ, BitWidth); @@ -750,8 +747,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero. - Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()); - // assume(v <_u c) + Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); + // assume(v <_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { @@ -761,9 +758,9 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, // Whatever high bits in c are zero are known to be zero (if c is a power // of 2, then one more). if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I))) - Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()+1); + Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1); else - Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()); + Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); } } @@ -916,7 +913,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, m_Value(Y))))) { Known2.resetAll(); computeKnownBits(Y, Known2, Depth + 1, Q); - if (Known2.One.countTrailingOnes() > 0) + if (Known2.countMinTrailingOnes() > 0) Known.Zero.setBit(0); } break; @@ -953,14 +950,13 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); - unsigned LeadZ = Known2.Zero.countLeadingOnes(); + unsigned LeadZ = Known2.countMinLeadingZeros(); Known2.resetAll(); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); - unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros(); - if (RHSUnknownLeadingOnes != BitWidth) - LeadZ = std::min(BitWidth, - LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); + unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); + if (RHSMaxLeadingZeros != BitWidth) + LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); Known.Zero.setHighBits(LeadZ); break; @@ -983,8 +979,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, if (Known.isNegative() && Known2.isNegative()) // We can derive a lower bound on the result by taking the max of the // leading one bits. - MaxHighOnes = std::max(Known.One.countLeadingOnes(), - Known2.One.countLeadingOnes()); + MaxHighOnes = + std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); // If either side is non-negative, the result is non-negative. else if (Known.isNonNegative() || Known2.isNonNegative()) MaxHighZeros = 1; @@ -993,8 +989,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, if (Known.isNonNegative() && Known2.isNonNegative()) // We can derive an upper bound on the result by taking the max of the // leading zero bits. 
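// Aside: the multiply rule being migrated to countMinTrailingZeros() above,
// checked standalone with hypothetical operands: a product has at least as
// many trailing zero bits as its operands' guaranteed trailing zeros
// combined, because the factors of two multiply.
#include <cassert>

static unsigned tz(unsigned V) { // count trailing zeros, V != 0
  unsigned N = 0;
  while (!(V & 1)) { V >>= 1; ++N; }
  return N;
}

int main() {
  unsigned A = 12; // 0b1100   -> 2 trailing zeros
  unsigned B = 40; // 0b101000 -> 3 trailing zeros
  assert(tz(A * B) >= tz(A) + tz(B)); // 480 = 0b111100000 has 5
}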
- MaxHighZeros = std::max(Known.Zero.countLeadingOnes(), - Known2.Zero.countLeadingOnes()); + MaxHighZeros = std::max(Known.countMinLeadingZeros(), + Known2.countMinLeadingZeros()); // If either side is negative, the result is negative. else if (Known.isNegative() || Known2.isNegative()) MaxHighOnes = 1; @@ -1002,12 +998,12 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // We can derive a lower bound on the result by taking the max of the // leading one bits. MaxHighOnes = - std::max(Known.One.countLeadingOnes(), Known2.One.countLeadingOnes()); + std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); } else if (SPF == SPF_UMIN) { // We can derive an upper bound on the result by taking the max of the // leading zero bits. MaxHighZeros = - std::max(Known.Zero.countLeadingOnes(), Known2.Zero.countLeadingOnes()); + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); } // Only known if known in both the LHS and RHS. @@ -1185,8 +1181,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); - unsigned Leaders = std::max(Known.Zero.countLeadingOnes(), - Known2.Zero.countLeadingOnes()); + unsigned Leaders = + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); Known.resetAll(); Known.Zero.setHighBits(Leaders); break; @@ -1207,7 +1203,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // to determine if we can prove known low zero bits. KnownBits LocalKnown(BitWidth); computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q); - unsigned TrailZ = LocalKnown.Zero.countTrailingOnes(); + unsigned TrailZ = LocalKnown.countMinTrailingZeros(); gep_type_iterator GTI = gep_type_begin(I); for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) { @@ -1241,7 +1237,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, computeKnownBits(Index, LocalKnown, Depth + 1, Q); TrailZ = std::min(TrailZ, unsigned(countTrailingZeros(TypeSize) + - LocalKnown.Zero.countTrailingOnes())); + LocalKnown.countMinTrailingZeros())); } } @@ -1286,8 +1282,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, KnownBits Known3(Known); computeKnownBits(L, Known3, Depth + 1, Q); - Known.Zero.setLowBits(std::min(Known2.Zero.countTrailingOnes(), - Known3.Zero.countTrailingOnes())); + Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(), + Known3.countMinTrailingZeros())); if (DontImproveNonNegativePhiBits) break; @@ -1386,12 +1382,25 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, Known.Zero |= Known2.Zero.byteSwap(); Known.One |= Known2.One.byteSwap(); break; - case Intrinsic::ctlz: + case Intrinsic::ctlz: { + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + // If we have a known 1, its position is our upper bound. + unsigned PossibleLZ = Known2.One.countLeadingZeros(); + // If this call is undefined for 0, the result will be less than 2^n. + if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) + PossibleLZ = std::min(PossibleLZ, BitWidth - 1); + unsigned LowBits = Log2_32(PossibleLZ)+1; + Known.Zero.setBitsFrom(LowBits); + break; + } case Intrinsic::cttz: { - unsigned LowBits = Log2_32(BitWidth)+1; + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + // If we have a known 1, its position is our upper bound. 
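// Aside: the bound used by the rewritten ctlz/cttz cases in this hunk,
// verified exhaustively for small values (the bit position P is
// hypothetical). If bit P is known to be one, then cttz(x) <= P, so the
// result fits in Log2(P)+1 bits and every higher bit of the result is
// known zero.
#include <cassert>
#include <cstdint>

static unsigned cttz(uint32_t X) { // X != 0
  unsigned N = 0;
  while (!(X & 1)) { X >>= 1; ++N; }
  return N;
}

int main() {
  const unsigned P = 5;       // suppose bit 5 is known one
  const unsigned LowBits = 3; // Log2_32(5) + 1
  for (uint32_t X = 1; X < 1024; ++X)
    if (X & (1u << P))
      assert(cttz(X) < (1u << LowBits)); // result uses only 3 low bits
}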
+ unsigned PossibleTZ = Known2.One.countTrailingZeros(); // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) - LowBits -= 1; + PossibleTZ = std::min(PossibleTZ, BitWidth - 1); + unsigned LowBits = Log2_32(PossibleTZ)+1; Known.Zero.setBitsFrom(LowBits); break; } @@ -1399,7 +1408,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // We can bound the space the count needs. Also, bits known to be zero // can't contribute to the population. - unsigned BitsPossiblySet = BitWidth - Known2.Zero.countPopulation(); + unsigned BitsPossiblySet = Known2.countMaxPopulation(); unsigned LowBits = Log2_32(BitsPossiblySet)+1; Known.Zero.setBitsFrom(LowBits); // TODO: we could bound KnownOne using the lower bound on the number @@ -1449,6 +1458,14 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, } } +/// Determine which bits of V are known to be either zero or one and return +/// them. +KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) { + KnownBits Known(getBitWidth(V->getType(), Q.DL)); + computeKnownBits(V, Known, Depth, Q); + return Known; +} + /// Determine which bits of V are known to be either zero or one and return /// them in the Known bit set. /// @@ -1568,16 +1585,6 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); } -/// Determine whether the sign bit is known to be zero or one. -/// Convenience wrapper around computeKnownBits. -void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - unsigned Depth, const Query &Q) { - KnownBits Bits(getBitWidth(V->getType(), Q.DL)); - computeKnownBits(V, Bits, Depth, Q); - KnownOne = Bits.isNegative(); - KnownZero = Bits.isNonNegative(); -} - /// Return true if the given value is known to have exactly one /// bit set when defined. For vectors return true if every element is known to /// be a power of two when defined. Supports values with integer or pointer @@ -1842,24 +1849,20 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (BO->isExact()) return isKnownNonZero(X, Depth, Q); - bool XKnownNonNegative, XKnownNegative; - ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q); - if (XKnownNegative) + KnownBits Known = computeKnownBits(X, Depth, Q); + if (Known.isNegative()) return true; // If the shifter operand is a constant, and all of the bits shifted // out are known to be zero, and X is known non-zero then at least one // non-zero bit must remain. if (ConstantInt *Shift = dyn_cast(Y)) { - KnownBits Known(BitWidth); - computeKnownBits(X, Known, Depth, Q); - auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); // Is there a known one in the portion not shifted out? - if (Known.One.countLeadingZeros() < BitWidth - ShiftVal) + if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal) return true; // Are all the bits to be shifted out known zero? - if (Known.Zero.countTrailingOnes() >= ShiftVal) + if (Known.countMinTrailingZeros() >= ShiftVal) return isKnownNonZero(X, Depth, Q); } } @@ -1869,39 +1872,34 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { } // X + Y. 
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) { - bool XKnownNonNegative, XKnownNegative; - bool YKnownNonNegative, YKnownNegative; - ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q); - ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q); + KnownBits XKnown = computeKnownBits(X, Depth, Q); + KnownBits YKnown = computeKnownBits(Y, Depth, Q); // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. - if (XKnownNonNegative && YKnownNonNegative) + if (XKnown.isNonNegative() && YKnown.isNonNegative()) if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q)) return true; // If X and Y are both negative (as signed values) then their sum is not // zero unless both X and Y equal INT_MIN. - if (XKnownNegative && YKnownNegative) { - KnownBits Known(BitWidth); + if (XKnown.isNegative() && YKnown.isNegative()) { APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. - computeKnownBits(X, Known, Depth, Q); - if (Known.One.intersects(Mask)) + if (XKnown.One.intersects(Mask)) return true; // The sign bit of Y is set. If some other bit is set then Y is not equal // to INT_MIN. - computeKnownBits(Y, Known, Depth, Q); - if (Known.One.intersects(Mask)) + if (YKnown.One.intersects(Mask)) return true; } // The sum of a non-negative number and a power of two is not zero. - if (XKnownNonNegative && + if (XKnown.isNonNegative() && isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q)) return true; - if (YKnownNonNegative && + if (YKnown.isNonNegative() && isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q)) return true; } @@ -2276,14 +2274,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, // If we know that the sign bit is either zero or one, determine the number of // identical bits in the top of the input value. - if (Known.isNonNegative()) - return std::max(FirstAnswer, Known.Zero.countLeadingOnes()); - - if (Known.isNegative()) - return std::max(FirstAnswer, Known.One.countLeadingOnes()); - - // computeKnownBits gave us no extra information about the top bits. - return FirstAnswer; + return std::max(FirstAnswer, Known.countMinSignBits()); } /// This function computes the integer multiple of Base that equals V. @@ -3441,8 +3432,8 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); // Note that underestimating the number of zero bits gives a more // conservative answer. - unsigned ZeroBits = LHSKnown.Zero.countLeadingOnes() + - RHSKnown.Zero.countLeadingOnes(); + unsigned ZeroBits = LHSKnown.countMinLeadingZeros() + + RHSKnown.countMinLeadingZeros(); // First handle the easy case: if we have enough zero bits there's // definitely no overflow. 
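// Aside: the leading-zeros argument above in standalone form (operand values
// hypothetical): if the guaranteed leading-zero counts of the operands sum to
// at least the bit width, the full product fits in that width and the
// unsigned multiply cannot wrap.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t LHS = 0xFFF;   // at most 12 significant bits: 20 leading zeros
  uint32_t RHS = 0xFFFFF; // at most 20 significant bits: 12 leading zeros
  uint64_t Wide = uint64_t(LHS) * RHS;
  assert(Wide <= UINT32_MAX); // 20 + 12 == 32 => no overflow possible
}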
   if (ZeroBits >= BitWidth)
@@ -3475,21 +3466,17 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
                                                    AssumptionCache *AC,
                                                    const Instruction *CxtI,
                                                    const DominatorTree *DT) {
-  bool LHSKnownNonNegative, LHSKnownNegative;
-  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
-                 AC, CxtI, DT);
-  if (LHSKnownNonNegative || LHSKnownNegative) {
-    bool RHSKnownNonNegative, RHSKnownNegative;
-    ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
-                   AC, CxtI, DT);
-
-    if (LHSKnownNegative && RHSKnownNegative) {
+  KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+  if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
+    KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+
+    if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
       // The sign bit is set in both cases: this MUST overflow.
       // Create a simple add instruction, and insert it into the struct.
       return OverflowResult::AlwaysOverflows;
     }
 
-    if (LHSKnownNonNegative && RHSKnownNonNegative) {
+    if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
       // The sign bit is clear in both cases: this CANNOT overflow.
       // Create a simple add instruction, and insert it into the struct.
       return OverflowResult::NeverOverflows;
@@ -3499,6 +3486,51 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
   return OverflowResult::MayOverflow;
 }
 
+/// \brief Return true if we can prove that adding the two values of the
+/// KnownBits will not overflow.
+/// Otherwise return false.
+static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
+                                    const KnownBits &RHSKnown) {
+  // Addition of two 2's complement numbers having opposite signs will never
+  // overflow.
+  if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
+      (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
+    return true;
+
+  // If either of the values is known to be non-negative, adding them can only
+  // overflow if the second is also non-negative, so we can assume that.
+  // Two non-negative numbers will only overflow if there is a carry to the
+  // sign bit, so we can check if even when the values are as big as possible
+  // there is no overflow to the sign bit.
+  if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
+    APInt MaxLHS = ~LHSKnown.Zero;
+    MaxLHS.clearSignBit();
+    APInt MaxRHS = ~RHSKnown.Zero;
+    MaxRHS.clearSignBit();
+    APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
+    return Result.isSignBitClear();
+  }
+
+  // If either of the values is known to be negative, adding them can only
+  // overflow if the second is also negative, so we can assume that.
+  // Two negative numbers will only overflow if there is no carry to the sign
+  // bit, so we can check if even when the values are as small as possible
+  // there is overflow to the sign bit.
+  if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
+    APInt MinLHS = LHSKnown.One;
+    MinLHS.clearSignBit();
+    APInt MinRHS = RHSKnown.One;
+    MinRHS.clearSignBit();
+    APInt Result = std::move(MinLHS) + std::move(MinRHS);
+    return Result.isSignBitSet();
+  }
+
+  // If we reached here it means that we know nothing about the sign bits.
+  // In this case we can't know if there will be an overflow, since by
+  // changing the sign bits any two values can be made to overflow.
+ return false; +} + static OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS, const AddOperator *Add, @@ -3510,18 +3542,29 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS, return OverflowResult::NeverOverflows; } - bool LHSKnownNonNegative, LHSKnownNegative; - bool RHSKnownNonNegative, RHSKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0, - AC, CxtI, DT); - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0, - AC, CxtI, DT); + // If LHS and RHS each have at least two sign bits, the addition will look + // like + // + // XX..... + + // YY..... + // + // If the carry into the most significant position is 0, X and Y can't both + // be 1 and therefore the carry out of the addition is also 0. + // + // If the carry into the most significant position is 1, X and Y can't both + // be 0 and therefore the carry out of the addition is also 1. + // + // Since the carry into the most significant position is always equal to + // the carry out of the addition, there is no signed overflow. + if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 && + ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1) + return OverflowResult::NeverOverflows; + + KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT); + KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); - if ((LHSKnownNonNegative && RHSKnownNegative) || - (LHSKnownNegative && RHSKnownNonNegative)) { - // The sign bits are opposite: this CANNOT overflow. + if (checkRippleForSignedAdd(LHSKnown, RHSKnown)) return OverflowResult::NeverOverflows; - } // The remaining code needs Add to be available. Early returns if not so. if (!Add) @@ -3532,14 +3575,13 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS, // @llvm.assume'ed non-negative rather than proved so from analyzing its // operands. 
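// Aside: the "at least two sign bits" argument above, checked exhaustively
// in miniature for 8-bit values. Two sign bits means the value lies in
// [-64, 63], so any sum lies in [-128, 126] and signed 8-bit addition cannot
// overflow; the carry into the sign position always equals the carry out.
#include <cassert>

int main() {
  for (int L = -64; L <= 63; ++L)
    for (int R = -64; R <= 63; ++R) {
      int S = L + R;
      assert(S >= -128 && S <= 127); // always representable in int8_t
    }
}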
bool LHSOrRHSKnownNonNegative = - (LHSKnownNonNegative || RHSKnownNonNegative); - bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative); + (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()); + bool LHSOrRHSKnownNegative = + (LHSKnown.isNegative() || RHSKnown.isNegative()); if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) { - bool AddKnownNonNegative, AddKnownNegative; - ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL, - /*Depth=*/0, AC, CxtI, DT); - if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) || - (AddKnownNegative && LHSOrRHSKnownNegative)) { + KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT); + if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) || + (AddKnown.isNegative() && LHSOrRHSKnownNegative)) { return OverflowResult::NeverOverflows; } } diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index 722f17a8067e..2d2249da4e13 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" using namespace llvm; using namespace llvm::PatternMatch; diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 97a567565b47..d7602c83435c 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -162,6 +162,10 @@ bool LLParser::ValidateEndOfModule() { AS = AS.addAttributes(Context, AttributeList::FunctionIndex, AttributeSet::get(Context, FnAttrs)); II->setAttributes(AS); + } else if (auto *GV = dyn_cast(V)) { + AttrBuilder Attrs(GV->getAttributes()); + Attrs.merge(B); + GV->setAttributes(AttributeSet::get(Context,Attrs)); } else { llvm_unreachable("invalid object with forward attribute group reference"); } @@ -832,10 +836,10 @@ bool LLParser::parseIndirectSymbol( /// ParseGlobal /// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass /// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace -/// OptionalExternallyInitialized GlobalType Type Const +/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs /// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass /// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace -/// OptionalExternallyInitialized GlobalType Type Const +/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs /// /// Everything up to and including OptionalUnnamedAddr has been parsed /// already. @@ -950,6 +954,16 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, } } + AttrBuilder Attrs; + LocTy BuiltinLoc; + std::vector FwdRefAttrGrps; + if (ParseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc)) + return true; + if (Attrs.hasAttributes() || !FwdRefAttrGrps.empty()) { + GV->setAttributes(AttributeSet::get(Context, Attrs)); + ForwardRefAttrGroups[GV] = FwdRefAttrGrps; + } + return false; } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 580261a3b5e0..76298121566a 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -93,13 +93,6 @@ static cl::opt PrintSummaryGUIDs( cl::desc( "Print the global id for each value when reading the module summary")); -// FIXME: This flag should either be removed or moved to clang as a driver flag. 
-static llvm::cl::opt IgnoreEmptyThinLTOIndexFile( - "ignore-empty-index-file", llvm::cl::ZeroOrMore, - llvm::cl::desc( - "Ignore an empty index file and perform non-ThinLTO compilation"), - llvm::cl::init(false)); - namespace { enum { @@ -2750,7 +2743,7 @@ Error BitcodeReader::parseComdatRecord(ArrayRef Record) { Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { // v1: [pointer type, isconst, initid, linkage, alignment, section, // visibility, threadlocal, unnamed_addr, externally_initialized, - // dllstorageclass, comdat] (name in VST) + // dllstorageclass, comdat, attributes] (name in VST) // v2: [strtab_offset, strtab_size, v1] StringRef Name; std::tie(Name, Record) = readNameFromStrtab(Record); @@ -2830,6 +2823,11 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { } else if (hasImplicitComdat(RawLinkage)) { NewGV->setComdat(reinterpret_cast(1)); } + + if (Record.size() > 12) { + auto AS = getAttributes(Record[12]).getFnAttributes(); + NewGV->setAttributes(AS); + } return Error::success(); } @@ -5658,7 +5656,8 @@ Expected llvm::hasGlobalValueSummary(MemoryBufferRef Buffer) { } Expected> -llvm::getModuleSummaryIndexForFile(StringRef Path) { +llvm::getModuleSummaryIndexForFile(StringRef Path, + bool IgnoreEmptyThinLTOIndexFile) { ErrorOr> FileOrErr = MemoryBuffer::getFileOrSTDIN(Path); if (!FileOrErr) diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 42135e5949ce..d80e1da911ca 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -500,7 +500,7 @@ class MetadataLoader::MetadataLoaderImpl { // Upgrade variables attached to globals. for (auto &GV : TheModule.globals()) { - SmallVector MDs, NewMDs; + SmallVector MDs; GV.getMetadata(LLVMContext::MD_dbg, MDs); GV.eraseMetadata(LLVMContext::MD_dbg); for (auto *MD : MDs) diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1b8d81a60201..1f8b50342c2d 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1109,7 +1109,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { // GLOBALVAR: [strtab offset, strtab size, type, isconst, initid, // linkage, alignment, section, visibility, threadlocal, // unnamed_addr, externally_initialized, dllstorageclass, - // comdat] + // comdat, attributes] Vals.push_back(StrtabBuilder.add(GV.getName())); Vals.push_back(GV.getName().size()); Vals.push_back(VE.getTypeID(GV.getValueType())); @@ -1124,13 +1124,17 @@ void ModuleBitcodeWriter::writeModuleInfo() { GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None || GV.isExternallyInitialized() || GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass || - GV.hasComdat()) { + GV.hasComdat() || + GV.hasAttributes()) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); Vals.push_back(getEncodedUnnamedAddr(GV)); Vals.push_back(GV.isExternallyInitialized()); Vals.push_back(getEncodedDLLStorageClass(GV)); Vals.push_back(GV.hasComdat() ? 
VE.getComdatID(GV.getComdat()) : 0); + + auto AL = GV.getAttributesAsList(AttributeList::FunctionIndex); + Vals.push_back(VE.getAttributeListID(AL)); } else { AbbrevToUse = SimpleGVarAbbrev; } diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index 861150766986..fd76400331d9 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -314,10 +314,13 @@ ValueEnumerator::ValueEnumerator(const Module &M, // Remember what is the cutoff between globalvalue's and other constants. unsigned FirstConstant = Values.size(); - // Enumerate the global variable initializers. - for (const GlobalVariable &GV : M.globals()) + // Enumerate the global variable initializers and attributes. + for (const GlobalVariable &GV : M.globals()) { if (GV.hasInitializer()) EnumerateValue(GV.getInitializer()); + if (GV.hasAttributes()) + EnumerateAttributes(GV.getAttributesAsList(AttributeList::FunctionIndex)); + } // Enumerate the aliasees. for (const GlobalAlias &GA : M.aliases()) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 76549540ce0f..73fc2b35fe4e 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -21,5 +21,5 @@ add_subdirectory(LineEditor) add_subdirectory(ProfileData) add_subdirectory(Fuzzer) add_subdirectory(Passes) -add_subdirectory(LibDriver) +add_subdirectory(ToolDrivers) add_subdirectory(XRay) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 87b45c001de4..98163bffb60b 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -767,7 +767,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, // If our DISubprogram name is empty, use the mangled name. if (FuncName.empty()) - FuncName = GlobalValue::getRealLinkageName(GV->getName()); + FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName()); // Emit a symbol subsection, required by VS2012+ to find function boundaries. OS.AddComment("Symbol subsection for " + Twine(FuncName)); @@ -888,13 +888,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable( if (!Scope) continue; + // If the variable has an attached offset expression, extract it. + // FIXME: Try to handle DW_OP_deref as well. + int64_t ExprOffset = 0; + if (VI.Expr) + if (!VI.Expr->extractIfOffset(ExprOffset)) + continue; + // Get the frame register used and the offset. unsigned FrameReg = 0; int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg); uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg); // Calculate the label ranges. - LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset); + LocalVarDefRange DefRange = + createDefRangeMem(CVReg, FrameOffset + ExprOffset); for (const InsnRange &Range : Scope->getRanges()) { const MCSymbol *Begin = getLabelBeforeInsn(Range.first); const MCSymbol *End = getLabelAfterInsn(Range.second); @@ -2194,7 +2202,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() { if (GV->hasComdat()) { MCSymbol *GVSym = Asm->getSymbol(GV); OS.AddComment("Symbol subsection for " + - Twine(GlobalValue::getRealLinkageName(GV->getName()))); + Twine(GlobalValue::dropLLVMManglingEscape(GV->getName()))); switchToDebugSectionForSymbol(GVSym); EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols); // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions. 
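// Aside: the backward-compatibility pattern used by the GLOBALVAR record
// changes above, sketched with stand-in types (FakeGlobal is invented).
// Older bitcode emits 12 fields; newer writers append an attribute-group ID
// as field 13, so the reader detects its presence purely by record length.
#include <cstdint>
#include <vector>

struct FakeGlobal {
  uint64_t AttrID = 0;
  bool HasAttrs = false;
};

void parseGlobalVarRecord(const std::vector<uint64_t> &Record, FakeGlobal &GV) {
  // ... fields 0..11 parsed as before ...
  if (Record.size() > 12) { // attributes only present in newer bitcode
    GV.AttrID = Record[12];
    GV.HasAttrs = true;
  }
}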
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 1d63e33a4d33..826162ad47c4 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -129,10 +129,9 @@ bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) {
 }
 
 void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
-  assert(Asm);
   PrevInstBB = nullptr;
 
-  if (!hasDebugInfo(MMI, MF)) {
+  if (!Asm || !hasDebugInfo(MMI, MF)) {
     skippedNonDebugFunction();
     return;
   }
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 738e062cb93f..e172712cf889 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -440,7 +440,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
   auto *InlinedSP = getDISubprogram(DS);
   // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
   // was inlined from another compile unit.
-  DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+  DIE *OriginDIE = getAbstractSPDies()[InlinedSP];
   assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
 
   auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
@@ -634,7 +634,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
 
 void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
     LexicalScope *Scope) {
-  DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+  DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()];
   if (AbsDef)
     return;
@@ -696,7 +696,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
 
 void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
   DIE *D = getDIE(SP);
-  if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+  if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) {
     if (D)
       // If this subprogram has an abstract definition, reference that
       addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
@@ -708,6 +708,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
   }
 }
 
+void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) {
+  DbgVariable *AbsVar = getExistingAbstractVariable(
+      InlinedVariable(Var.getVariable(), Var.getInlinedAt()));
+  auto *VariableDie = Var.getDIE();
+  if (AbsVar && AbsVar->getDIE()) {
+    addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+                *AbsVar->getDIE());
+  } else
+    applyVariableAttributes(Var, *VariableDie);
+}
+
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) {
+  const DILocalVariable *Cleansed;
+  return getExistingAbstractVariable(IV, Cleansed);
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(
+    InlinedVariable IV, const DILocalVariable *&Cleansed) {
+  // More than one inlined variable corresponds to one abstract variable.
+ Cleansed = IV.first; + auto &AbstractVariables = getAbstractVariables(); + auto I = AbstractVariables.find(Cleansed); + if (I != AbstractVariables.end()) + return I->second.get(); + return nullptr; +} + +void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var, + LexicalScope *Scope) { + assert(Scope && Scope->isAbstractScope()); + auto AbsDbgVariable = make_unique(Var, /* IA */ nullptr); + DU->addScopeVariable(Scope, AbsDbgVariable.get()); + getAbstractVariables()[Var] = std::move(AbsDbgVariable); +} + void DwarfCompileUnit::emitHeader(bool UseOffsets) { // Don't bother labeling the .dwo unit, as its offset isn't used. if (!Skeleton) { diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 20a415150b4d..77e9e671529f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -68,6 +68,9 @@ class DwarfCompileUnit final : public DwarfUnit { // ranges/locs. const MCSymbol *BaseAddress; + DenseMap AbstractSPDies; + DenseMap> AbstractVariables; + /// \brief Construct a DIE for the given DbgVariable without initializing the /// DbgVariable's DIE reference. DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract); @@ -76,6 +79,18 @@ class DwarfCompileUnit final : public DwarfUnit { bool includeMinimalInlineScopes() const; + DenseMap &getAbstractSPDies() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractSPDies; + return DU->getAbstractSPDies(); + } + + DenseMap> &getAbstractVariables() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractVariables; + return DU->getAbstractVariables(); + } + public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); @@ -189,6 +204,13 @@ public: DIE *constructImportedEntityDIE(const DIImportedEntity *Module); void finishSubprogramDefinition(const DISubprogram *SP); + void finishVariableDefinition(const DbgVariable &Var); + /// Find abstract variable associated with Var. + typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; + DbgVariable *getExistingAbstractVariable(InlinedVariable IV, + const DILocalVariable *&Cleansed); + DbgVariable *getExistingAbstractVariable(InlinedVariable IV); + void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope); /// Set the skeleton unit associated with this unit. 
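// Aside: the routing pattern added to DwarfCompileUnit above, sketched with
// stand-in types (Cache and Unit are invented). A .dwo unit keeps a private
// per-CU map unless cross-CU sharing is explicitly enabled; otherwise the
// maps shared through DwarfFile are used, so abstract DIEs never acquire
// accidental cross-DWO-CU references.
#include <map>
#include <string>

struct Cache { std::map<std::string, int> Entries; };

struct Unit {
  bool IsDwoUnit = true;
  bool ShareAcrossDWOCUs = false; // mirrors -split-dwarf-cross-cu-references
  Cache Local;                    // per-unit storage
  Cache *Shared = nullptr;        // storage shared via the DwarfFile

  Cache &abstractDies() {
    if (IsDwoUnit && !ShareAcrossDWOCUs)
      return Local; // sharing would create cross-CU references; keep it local
    return *Shared;
  }
};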
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 6f442f5c3172..3410b98d7776 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -71,6 +71,10 @@ static cl::opt GenerateARangeSection("generate-arange-section", cl::desc("Generate dwarf aranges"), cl::init(false)); +static cl::opt SplitDwarfCrossCuReferences( + "split-dwarf-cross-cu-references", cl::Hidden, + cl::desc("Enable cross-cu references in DWO files"), cl::init(false)); + namespace { enum DefaultOnOff { Default, Enable, Disable }; } @@ -362,21 +366,29 @@ template static void forBothCUs(DwarfCompileUnit &CU, Func F) { F(*SkelCU); } -void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { +bool DwarfDebug::shareAcrossDWOCUs() const { + return SplitDwarfCrossCuReferences; +} + +void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, + LexicalScope *Scope) { assert(Scope && Scope->getScopeNode()); assert(Scope->isAbstractScope()); assert(!Scope->getInlinedAt()); auto *SP = cast(Scope->getScopeNode()); - ProcessedSPNodes.insert(SP); - // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. auto &CU = *CUMap.lookup(SP->getUnit()); - forBothCUs(CU, [&](DwarfCompileUnit &CU) { + if (auto *SkelCU = CU.getSkeleton()) { + (shareAcrossDWOCUs() ? CU : SrcCU) + .constructAbstractSubprogramScopeDIE(Scope); + if (CU.getCUNode()->getSplitDebugInlining()) + SkelCU->constructAbstractSubprogramScopeDIE(Scope); + } else { CU.constructAbstractSubprogramScopeDIE(Scope); - }); + } } void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { @@ -564,13 +576,7 @@ void DwarfDebug::finishVariableDefinitions() { // DIE::getUnit isn't simple - it walks parent pointers, etc. DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie()); assert(Unit); - DbgVariable *AbsVar = getExistingAbstractVariable( - InlinedVariable(Var->getVariable(), Var->getInlinedAt())); - if (AbsVar && AbsVar->getDIE()) { - Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, - *AbsVar->getDIE()); - } else - Unit->applyVariableAttributes(*Var, *VariableDie); + Unit->finishVariableDefinition(*Var); } } @@ -718,58 +724,32 @@ void DwarfDebug::endModule() { } // clean up. - AbstractVariables.clear(); + // FIXME: AbstractVariables.clear(); } -// Find abstract variable, if any, associated with Var. -DbgVariable * -DwarfDebug::getExistingAbstractVariable(InlinedVariable IV, - const DILocalVariable *&Cleansed) { - // More then one inlined variable corresponds to one abstract variable. 
- Cleansed = IV.first; - auto I = AbstractVariables.find(Cleansed); - if (I != AbstractVariables.end()) - return I->second.get(); - return nullptr; -} - -DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) { - const DILocalVariable *Cleansed; - return getExistingAbstractVariable(IV, Cleansed); -} - -void DwarfDebug::createAbstractVariable(const DILocalVariable *Var, - LexicalScope *Scope) { - assert(Scope && Scope->isAbstractScope()); - auto AbsDbgVariable = make_unique(Var, /* IA */ nullptr); - InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get()); - AbstractVariables[Var] = std::move(AbsDbgVariable); -} - -void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV, +void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; - createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( + CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( cast(ScopeNode))); } -void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped( +void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; if (LexicalScope *Scope = LScopes.findAbstractScope(cast_or_null(ScopeNode))) - createAbstractVariable(Cleansed, Scope); + CU.createAbstractVariable(Cleansed, Scope); } - // Collect variable information from side table maintained by MF. void DwarfDebug::collectVariableInfoFromMFTable( - DenseSet &Processed) { + DwarfCompileUnit &TheCU, DenseSet &Processed) { for (const auto &VI : Asm->MF->getVariableDbgInfo()) { if (!VI.Var) continue; @@ -784,7 +764,7 @@ void DwarfDebug::collectVariableInfoFromMFTable( if (!Scope) continue; - ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode()); auto RegVar = make_unique(Var.first, Var.second); RegVar->initializeMMI(VI.Expr, VI.Slot); if (InfoHolder.addScopeVariable(Scope, RegVar.get())) @@ -955,9 +935,10 @@ DwarfDebug::buildLocationList(SmallVectorImpl &DebugLoc, } } -DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope, +DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU, + LexicalScope &Scope, InlinedVariable IV) { - ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode()); ConcreteVariables.push_back(make_unique(IV.first, IV.second)); InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get()); return ConcreteVariables.back().get(); @@ -980,7 +961,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP, DenseSet &Processed) { // Grab the variable info that was squirreled away in the MMI side-table. 
- collectVariableInfoFromMFTable(Processed); + collectVariableInfoFromMFTable(TheCU, Processed); for (const auto &I : DbgValues) { InlinedVariable IV = I.first; @@ -1002,7 +983,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, continue; Processed.insert(IV); - DbgVariable *RegVar = createConcreteVariable(*Scope, IV); + DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV); const MachineInstr *MInsn = Ranges.front().first; assert(MInsn->isDebugValue() && "History must begin with debug value"); @@ -1038,7 +1019,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, for (const DILocalVariable *DV : SP->getVariables()) { if (Processed.insert(InlinedVariable(DV, nullptr)).second) if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope())) - createConcreteVariable(*Scope, InlinedVariable(DV, nullptr)); + createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr)); } } @@ -1229,12 +1210,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { for (const DILocalVariable *DV : SP->getVariables()) { if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second) continue; - ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr), + ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr), DV->getScope()); assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes && "ensureAbstractVariableIsCreated inserted abstract scopes"); } - constructAbstractSubprogramScopeDIE(AScope); + constructAbstractSubprogramScopeDIE(TheCU, AScope); } ProcessedSPNodes.insert(SP); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 8a96e7867b6e..b9c5aa9ffb23 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -210,7 +210,6 @@ class DwarfDebug : public DebugHandlerBase { DenseMap SymSize; /// Collection of abstract variables. - DenseMap> AbstractVariables; SmallVector, 64> ConcreteVariables; /// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists @@ -313,20 +312,16 @@ class DwarfDebug : public DebugHandlerBase { typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; - /// Find abstract variable associated with Var. - DbgVariable *getExistingAbstractVariable(InlinedVariable IV, - const DILocalVariable *&Cleansed); - DbgVariable *getExistingAbstractVariable(InlinedVariable IV); - void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope); - void ensureAbstractVariableIsCreated(InlinedVariable Var, + void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var, const MDNode *Scope); - void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var, + void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var, const MDNode *Scope); - DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV); + DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU, + LexicalScope &Scope, InlinedVariable IV); /// Construct a DIE for this abstract scope. - void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); + void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope); void finishVariableDefinitions(); @@ -446,7 +441,8 @@ class DwarfDebug : public DebugHandlerBase { const DbgValueHistoryMap::InstrRanges &Ranges); /// Collect variable information from the side table maintained by MF. 
- void collectVariableInfoFromMFTable(DenseSet &P); + void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU, + DenseSet &P); protected: /// Gather pre-function debug information. @@ -518,6 +514,8 @@ public: /// split dwarf proposal support. bool useSplitDwarf() const { return HasSplitDwarf; } + bool shareAcrossDWOCUs() const; + /// Returns the Dwarf Version. uint16_t getDwarfVersion() const; diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index d4d2ed277274..54924e9806ed 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -53,6 +53,7 @@ class DwarfFile { // Collection of abstract subprogram DIEs. DenseMap AbstractSPDies; + DenseMap> AbstractVariables; /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can /// be shared across CUs, that is why we keep the map here instead @@ -105,6 +106,9 @@ public: DenseMap &getAbstractSPDies() { return AbstractSPDies; } + DenseMap> &getAbstractVariables() { + return AbstractVariables; + } void insertDIE(const MDNode *TypeMD, DIE *Die) { DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 8d25def7772c..667afbb450bd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -173,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const { } /// Check whether the DIE for this MDNode can be shared across CUs. -static bool isShareableAcrossCUs(const DINode *D) { +bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const { // When the MDNode can be part of the type system, the DIE can be shared // across CUs. // Combining type units and cross-CU DIE sharing is lower value (since @@ -181,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) { // level already) but may be implementable for some value in projects // building multiple independent libraries with LTO and then linking those // together. + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return false; return (isa(D) || (isa(D) && !cast(D)->isDefinition())) && !GenerateDwarfTypeUnits; @@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) { addString(Die, DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name : dwarf::DW_AT_MIPS_linkage_name, - GlobalValue::getRealLinkageName(LinkageName)); + GlobalValue::dropLLVMManglingEscape(LinkageName)); } void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) { diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8fc841703e23..7acad2cbd89f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -65,7 +65,7 @@ public: //===----------------------------------------------------------------------===// /// This dwarf writer support class manages information associated with a /// source file. - class DwarfUnit : public DIEUnit { +class DwarfUnit : public DIEUnit { protected: /// MDNode for the compile unit. const DICompileUnit *CUNode; @@ -103,6 +103,9 @@ protected: bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); + bool shareAcrossDWOCUs() const; + bool isShareableAcrossCUs(const DINode *D) const; + public: // Accessors. 
AsmPrinter* getAsmPrinter() const { return Asm; } diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp index 704f0ac2f191..815658bfb637 100644 --- a/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/lib/CodeGen/AsmPrinter/WinException.cpp @@ -101,7 +101,7 @@ void WinException::beginFunction(const MachineFunction *MF) { // functions may still refer to it. const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName); } shouldEmitLSDA = hasEHFunclets; @@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm, // their funclet entry block's number. const MachineFunction *MF = MBB->getParent(); const Function *F = MF->getFunction(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCContext &Ctx = MF->getContext(); StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch"; return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" + @@ -252,7 +252,7 @@ void WinException::endFunclet() { !CurrentFuncletEntry->isCleanupFuncletEntry()) { // If this is a C++ catch funclet (or the parent function), // emit a reference to the LSDA for the parent function. - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol( Twine("$cppxdata$", FuncLinkageName)); Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4); @@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) { // Emit a label assignment with the SEH frame offset so we can use it for // llvm.x86.seh.recoverfp. 
StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); MCSymbol *ParentFrameOffset = Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); const MCExpr *MCOffset = @@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { auto &OS = *Asm->OutStreamer; const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); SmallVector, 4> IPToStateTable; MCSymbol *FuncInfoXData = nullptr; @@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo, void WinException::emitExceptHandlerTable(const MachineFunction *MF) { MCStreamer &OS = *Asm->OutStreamer; const Function *F = MF->getFunction(); - StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); bool VerboseAsm = OS.isVerboseAsm(); auto AddComment = [&](const Twine &Comment) { diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index 9c19a4fd3c3e..17e6be05eb42 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -47,8 +47,7 @@ namespace { bool runOnFunction(Function &F) override; private: - bool bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad); + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); bool tryExpandAtomicLoad(LoadInst *LI); @@ -224,22 +223,16 @@ bool AtomicExpand::runOnFunction(Function &F) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; - bool IsStore, IsLoad; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); - IsStore = false; - IsLoad = true; } else if (SI && isReleaseOrStronger(SI->getOrdering())) { FenceOrdering = SI->getOrdering(); SI->setOrdering(AtomicOrdering::Monotonic); - IsStore = true; - IsLoad = false; } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) || isAcquireOrStronger(RMWI->getOrdering()))) { FenceOrdering = RMWI->getOrdering(); RMWI->setOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) && (isReleaseOrStronger(CASI->getSuccessOrdering()) || isAcquireOrStronger(CASI->getSuccessOrdering()))) { @@ -250,11 +243,10 @@ bool AtomicExpand::runOnFunction(Function &F) { FenceOrdering = CASI->getSuccessOrdering(); CASI->setSuccessOrdering(AtomicOrdering::Monotonic); CASI->setFailureOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad); + MadeChange |= bracketInstWithFences(I, FenceOrdering); } } @@ -320,13 +312,12 @@ bool AtomicExpand::runOnFunction(Function &F) { return MadeChange; } -bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad) { +bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { IRBuilder<> Builder(I); - auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad); + auto LeadingFence = TLI->emitLeadingFence(Builder, 
I, Order); - auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); // The trailing fence is emitted before the instruction instead of after // because there is no easy way of setting Builder insertion point after // an instruction. So we must erase it from the BB, and insert it back @@ -1048,8 +1039,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(StartBB); // Start the main loop block now that we've taken care of the preliminaries. @@ -1064,8 +1054,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(ReleasingStoreBB); if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(TryStoreBB); Builder.SetInsertPoint(TryStoreBB); @@ -1094,8 +1083,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // necessary. Builder.SetInsertPoint(SuccessBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, SuccessOrder); Builder.CreateBr(ExitBB); Builder.SetInsertPoint(NoStoreBB); @@ -1107,8 +1095,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(FailureBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, FailureOrder); Builder.CreateBr(ExitBB); // Finally, we have control-flow based knowledge of whether the cmpxchg diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 26da748fa244..55a27e2fb79e 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -23,6 +23,7 @@ add_llvm_library(LLVMCodeGen ExecutionDepsFix.cpp ExpandISelPseudos.cpp ExpandPostRAPseudos.cpp + ExpandReductions.cpp FaultMaps.cpp FEntryInserter.cpp FuncletLayout.cpp @@ -48,6 +49,7 @@ add_llvm_library(LLVMCodeGen LivePhysRegs.cpp LiveRangeCalc.cpp LiveRangeEdit.cpp + LiveRangeShrink.cpp LiveRegMatrix.cpp LiveRegUnits.cpp LiveStackAnalysis.cpp @@ -118,6 +120,7 @@ add_llvm_library(LLVMCodeGen SafeStack.cpp SafeStackColoring.cpp SafeStackLayout.cpp + ScalarizeMaskedMemIntrin.cpp ScheduleDAG.cpp ScheduleDAGInstrs.cpp ScheduleDAGPrinter.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 3fc12ccc3b60..4d30c6574b12 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -43,6 +43,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); + initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); @@ -79,7 +80,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRAGreedyPass(Registry); initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); - initializeSafeStackPass(Registry); + initializeSafeStackLegacyPassPass(Registry); + 
initializeScalarizeMaskedMemIntrinPass(Registry); initializeShrinkWrapPass(Registry); initializeSlotIndexesPass(Registry); initializeStackColoringPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index c6c93811a0f9..f2e024c5e3bd 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -295,7 +295,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (PSI->isFunctionHotInCallGraph(&F)) F.setSectionPrefix(".hot"); else if (PSI->isFunctionColdInCallGraph(&F)) - F.setSectionPrefix(".cold"); + F.setSectionPrefix(".unlikely"); } /// This optimization identifies DIV instructions that can be @@ -1549,519 +1549,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, return MadeChange; } -// Translate a masked load intrinsic like -// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, -// <16 x i1> %mask, <16 x i32> %passthru) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.load, label %else -// -//cond.load: ; preds = %0 -// %4 = getelementptr i32* %1, i32 0 -// %5 = load i32* %4 -// %6 = insertelement <16 x i32> undef, i32 %5, i32 0 -// br label %else -// -//else: ; preds = %0, %cond.load -// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] -// %7 = extractelement <16 x i1> %mask, i32 1 -// %8 = icmp eq i1 %7, true -// br i1 %8, label %cond.load1, label %else2 -// -//cond.load1: ; preds = %else -// %9 = getelementptr i32* %1, i32 1 -// %10 = load i32* %9 -// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 -// br label %else2 -// -//else2: ; preds = %else, %cond.load1 -// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] -// %12 = extractelement <16 x i1> %mask, i32 2 -// %13 = icmp eq i1 %12, true -// br i1 %13, label %cond.load4, label %else5 -// -static void scalarizeMaskedLoad(CallInst *CI) { - Value *Ptr = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - unsigned AlignVal = cast(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast(CI->getType()); - assert(VecType && "Unexpected return type of masked load intrinsic"); - - Type *EltTy = CI->getType()->getVectorElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa(Mask) && - cast(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. 
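One detail worth spelling out in the expansion being moved out of this file: a vector load's alignment only constrains the address of lane 0, so each scalarized element load may assume at most the element size. Illustrative arithmetic only, not LLVM code:

#include <algorithm>

// For a 32-byte-aligned <8 x i32> masked load, each scalarized i32 load may
// claim at most min(32, 32 / 8) = 4 bytes of alignment.
inline unsigned scalarizedAlign(unsigned VecAlign, unsigned ScalarSizeInBits) {
  return std::min(VecAlign, ScalarSizeInBits / 8);
}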
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - - if (isa(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_load = icmp eq i1 %mask_1, true - // br i1 %to_load, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked store intrinsic, like -// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, -// <16 x i1> %mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.store, label %else -// -// cond.store: ; preds = %0 -// %4 = extractelement <16 x i32> %val, i32 0 -// %5 = getelementptr i32* %1, i32 0 -// store i32 %4, i32* %5 -// br label %else -// -// else: ; preds = %0, %cond.store -// %6 = extractelement <16 x i1> %mask, i32 1 -// %7 = icmp eq i1 %6, true -// br i1 %7, 
label %cond.store1, label %else2 -// -// cond.store1: ; preds = %else -// %8 = extractelement <16 x i32> %val, i32 1 -// %9 = getelementptr i32* %1, i32 1 -// store i32 %8, i32* %9 -// br label %else2 -// . . . -static void scalarizeMaskedStore(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptr = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - unsigned AlignVal = cast(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast(Src->getType()); - assert(VecType && "Unexpected data type in masked store intrinsic"); - - Type *EltTy = VecType->getElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa(Mask) && - cast(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Builder.CreateAlignedStore(Src, Ptr, AlignVal); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. - AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - if (isa(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - } - CI->eraseFromParent(); - return; - } - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_store = icmp eq i1 %mask_1, true - // br i1 %to_store, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %OneElt = extractelement <16 x i32> %Src, i32 Idx - // %EltAddr = getelementptr i32* %1, i32 0 - // %store i32 %OneElt, i32* %EltAddr - // - BasicBlock *CondBlock = - IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - -// Translate a masked gather intrinsic like -// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, -// <16 x i1> %Mask, <16 x i32> %Src) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> %Mask, 
i32 0 -// % ToLoad0 = icmp eq i1 % Mask0, true -// br i1 % ToLoad0, label %cond.load, label %else -// -// cond.load: -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// % Load0 = load i32, i32* % Ptr0, align 4 -// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 -// br label %else -// -// else: -// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] -// % Mask1 = extractelement <16 x i1> %Mask, i32 1 -// % ToLoad1 = icmp eq i1 % Mask1, true -// br i1 % ToLoad1, label %cond.load1, label %else2 -// -// cond.load1: -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// % Load1 = load i32, i32* % Ptr1, align 4 -// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 -// br label %else2 -// . . . -// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src -// ret <16 x i32> %Result -static void scalarizeMaskedGather(CallInst *CI) { - Value *Ptrs = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - VectorType *VecType = dyn_cast(CI->getType()); - - assert(VecType && "Unexpected return type of masked load intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - unsigned AlignVal = cast(Alignment)->getZExtValue(); - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - unsigned VectorWidth = VecType->getNumElements(); - - // Shorten the way if the mask is a vector of constants. - bool IsConstMask = isa(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx), - "Res" + Twine(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %Mask1 = extractelement <16 x i1> %Mask, i32 1 - // %ToLoad1 = icmp eq i1 %Mask1, true - // br i1 %ToLoad1, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToLoad" + Twine(Idx)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + 
Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), - "Res" + Twine(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked scatter intrinsic, like -// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, -// <16 x i1> %Mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set. -// -// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> % Mask, i32 0 -// % ToStore0 = icmp eq i1 % Mask0, true -// br i1 %ToStore0, label %cond.store, label %else -// -// cond.store: -// % Elt0 = extractelement <16 x i32> %Src, i32 0 -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// store i32 %Elt0, i32* % Ptr0, align 4 -// br label %else -// -// else: -// % Mask1 = extractelement <16 x i1> % Mask, i32 1 -// % ToStore1 = icmp eq i1 % Mask1, true -// br i1 % ToStore1, label %cond.store1, label %else2 -// -// cond.store1: -// % Elt1 = extractelement <16 x i32> %Src, i32 1 -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// store i32 % Elt1, i32* % Ptr1, align 4 -// br label %else2 -// . . . -static void scalarizeMaskedScatter(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptrs = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - assert(isa(Src->getType()) && - "Unexpected data type in masked scatter intrinsic"); - assert(isa(Ptrs->getType()) && - isa(Ptrs->getType()->getVectorElementType()) && - "Vector of pointers is expected in masked scatter intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - unsigned AlignVal = cast(Alignment)->getZExtValue(); - unsigned VectorWidth = Src->getType()->getVectorNumElements(); - - // Shorten the way if the mask is a vector of constants. 
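The constant-mask fast path that follows needs no control flow at all. A standalone analog of what it emits, assuming the mask is known at compile time (illustrative C++, not LLVM code):

#include <cstddef>

// One unconditional store per enabled lane; disabled lanes are skipped
// entirely rather than branched around.
template <std::size_t N>
void scatterConstMask(const int (&Src)[N], int *const (&Ptrs)[N],
                      const bool (&Mask)[N]) {
  for (std::size_t Idx = 0; Idx != N; ++Idx)
    if (Mask[Idx]) // resolved at compile time in the real expansion
      *Ptrs[Idx] = Src[Idx];
}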
- bool IsConstMask = isa(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - } - CI->eraseFromParent(); - return; - } - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - // Fill the "else" block, created in the previous iteration - // - // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx - // % ToStore = icmp eq i1 % Mask1, true - // br i1 % ToStore, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = - Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToStore" + Twine(Idx)); - - // Create "cond" block - // - // % Elt1 = extractelement <16 x i32> %Src, i32 1 - // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 - // %store i32 % Elt1, i32* % Ptr1 - // - BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - /// If counting leading or trailing zeros is an expensive operation and a zero /// input is defined, add a check for zero to avoid calling the intrinsic. 
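The despeculation described here wraps the intrinsic in an explicit zero test so the expensive count only runs on non-zero inputs. A scalar sketch, using a compiler builtin purely for illustration; the pass itself rewrites IR:

#include <cstdint>

// Guarded count-trailing-zeros: when a zero input is defined, branch around
// the costly count and return the bit width instead.
inline unsigned cttzGuarded(uint32_t X) {
  return X == 0 ? 32u : static_cast<unsigned>(__builtin_ctz(X));
}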
/// @@ -2242,39 +1729,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { } return true; } - case Intrinsic::masked_load: { - // Scalarize unsupported vector masked load - if (!TTI->isLegalMaskedLoad(CI->getType())) { - scalarizeMaskedLoad(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_store: { - if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { - scalarizeMaskedStore(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_gather: { - if (!TTI->isLegalMaskedGather(CI->getType())) { - scalarizeMaskedGather(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_scatter: { - if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { - scalarizeMaskedScatter(CI); - ModifiedDT = true; - return true; - } - return false; - } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast(CI->getArgOperand(0)); diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index ab2382e2db6d..e860906043dd 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) { MachineOperand &DstMO = MI->getOperand(0); MachineOperand &SrcMO = MI->getOperand(1); - if (SrcMO.getReg() == DstMO.getReg()) { - DEBUG(dbgs() << "identity copy: " << *MI); + bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg()); + if (IdentityCopy || SrcMO.isUndef()) { + DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI); // No need to insert an identity copy instruction, but replace with a KILL // if liveness is changed. if (SrcMO.isUndef() || MI->getNumOperands() > 2) { diff --git a/lib/CodeGen/ExpandReductions.cpp b/lib/CodeGen/ExpandReductions.cpp new file mode 100644 index 000000000000..a40ea28056dd --- /dev/null +++ b/lib/CodeGen/ExpandReductions.cpp @@ -0,0 +1,167 @@ +//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements IR expansion for reduction intrinsics, allowing targets +// to enable the experimental intrinsics until just before codegen. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/ExpandReductions.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +unsigned getOpcode(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + return Instruction::FAdd; + case Intrinsic::experimental_vector_reduce_fmul: + return Instruction::FMul; + case Intrinsic::experimental_vector_reduce_add: + return Instruction::Add; + case Intrinsic::experimental_vector_reduce_mul: + return Instruction::Mul; + case Intrinsic::experimental_vector_reduce_and: + return Instruction::And; + case Intrinsic::experimental_vector_reduce_or: + return Instruction::Or; + case Intrinsic::experimental_vector_reduce_xor: + return Instruction::Xor; + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + return Instruction::ICmp; + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + return Instruction::FCmp; + default: + llvm_unreachable("Unexpected ID"); + } +} + +RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_smax: + return RecurrenceDescriptor::MRK_SIntMax; + case Intrinsic::experimental_vector_reduce_smin: + return RecurrenceDescriptor::MRK_SIntMin; + case Intrinsic::experimental_vector_reduce_umax: + return RecurrenceDescriptor::MRK_UIntMax; + case Intrinsic::experimental_vector_reduce_umin: + return RecurrenceDescriptor::MRK_UIntMin; + case Intrinsic::experimental_vector_reduce_fmax: + return RecurrenceDescriptor::MRK_FloatMax; + case Intrinsic::experimental_vector_reduce_fmin: + return RecurrenceDescriptor::MRK_FloatMin; + default: + return RecurrenceDescriptor::MRK_Invalid; + } +} + +bool expandReductions(Function &F, const TargetTransformInfo *TTI) { + bool Changed = false; + SmallVector Worklist; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (auto II = dyn_cast(&*I)) + Worklist.push_back(II); + + for (auto *II : Worklist) { + IRBuilder<> Builder(II); + Value *Vec = nullptr; + auto ID = II->getIntrinsicID(); + auto MRK = RecurrenceDescriptor::MRK_Invalid; + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + // FMFs must be attached to the call, otherwise it's an ordered reduction + // and it can't be handled by generating this shuffle sequence. + // TODO: Implement scalarization of ordered reductions here for targets + // without native support. 
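For the intrinsics that do qualify, getShuffleReduction lowers the reduction to roughly log2(N) rounds of shuffle-and-combine. A standalone sketch of the resulting dataflow (plain C++ over arrays, not LLVM code; N is assumed to be a power of two):

#include <array>
#include <cstddef>

template <std::size_t N>
int reduceAdd(std::array<int, N> V) {
  // Each round folds the upper half of the active lanes into the lower half,
  // mirroring the vector shuffles the pass emits; lane 0 ends up with the sum.
  for (std::size_t Width = N / 2; Width >= 1; Width /= 2)
    for (std::size_t I = 0; I < Width; ++I)
      V[I] += V[I + Width];
  return V[0];
}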
+      if (!II->getFastMathFlags().unsafeAlgebra())
+        continue;
+      Vec = II->getArgOperand(1);
+      break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_umax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_fmax:
+    case Intrinsic::experimental_vector_reduce_fmin:
+      Vec = II->getArgOperand(0);
+      MRK = getMRK(ID);
+      break;
+    default:
+      continue;
+    }
+    if (!TTI->shouldExpandReduction(II))
+      continue;
+    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    II->replaceAllUsesWith(Rdx);
+    II->eraseFromParent();
+    Changed = true;
+  }
+  return Changed;
+}
+
+class ExpandReductions : public FunctionPass {
+public:
+  static char ID;
+  ExpandReductions() : FunctionPass(ID) {
+    initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return expandReductions(F, TTI);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+}
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+                      "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+                    "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+  return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!expandReductions(F, &TTI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index eaf4056e47ea..4d4591042296 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -162,7 +162,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
   return std::get<0>(getAction(MI, MRI)) == Legal;
 }
 
-LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
+Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
                                  LegalizeAction Action) const {
   switch(Action) {
   default:
@@ -174,20 +174,20 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
     return Aspect.Type;
   case NarrowScalar: {
     return findLegalType(Aspect,
-                         [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
+                         [](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
   }
   case WidenScalar: {
-    return findLegalType(Aspect, [&](LLT Ty) -> LLT {
+    return findLegalType(Aspect, [](LLT Ty) -> LLT {
      return Ty.getSizeInBits() < 8 ?
LLT::scalar(8) : Ty.doubleScalarSize(); }); } case FewerElements: { return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.halfElements(); }); + [](LLT Ty) -> LLT { return Ty.halfElements(); }); } case MoreElements: { return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.doubleElements(); }); + [](LLT Ty) -> LLT { return Ty.doubleElements(); }); } } } diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 7248f50945d0..2eb3cdee694d 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -204,12 +204,8 @@ uint64_t RegBankSelect::getRepairCost( // TODO: use a dedicated constant for ImpossibleCost. if (Cost != UINT_MAX) return Cost; - assert(!TPC->isGlobalISelAbortEnabled() && - "Legalization not available yet"); // Return the legalization cost of that repairing. } - assert(!TPC->isGlobalISelAbortEnabled() && - "Complex repairing not implemented yet"); return UINT_MAX; } @@ -452,6 +448,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( // Sums up the repairing cost of MO at each insertion point. uint64_t RepairCost = getRepairCost(MO, ValMapping); + + // This is an impossible to repair cost. + if (RepairCost == UINT_MAX) + continue; + // Bias used for splitting: 5%. const uint64_t PercentageForBias = 5; uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100; diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp index 3c93f8123b0d..254bdf10d804 100644 --- a/lib/CodeGen/GlobalISel/Utils.cpp +++ b/lib/CodeGen/GlobalISel/Utils.cpp @@ -110,3 +110,11 @@ Optional llvm::getConstantVRegVal(unsigned VReg, return None; } + +const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(VReg); + if (TargetOpcode::G_FCONSTANT != MI->getOpcode()) + return nullptr; + return MI->getOperand(1).getFPImm(); +} diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 37fe41582333..628d599a3cc7 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1318,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) { return false; PI = I++; } - return true; + // Finally see if the last I is indeed a successor to PI. + return PI->isSuccessor(&*I); } /// Invalidate predecessor BB info so it would be re-analyzed to determine if it @@ -1587,22 +1588,32 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB); } + // To be able to insert code freely at the end of BBI we sometimes remove + // the branch from BBI to NextMBB temporarily. Remember if this happened. + bool RemovedBranchToNextMBB = false; if (CvtMBB.pred_size() > 1) { BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true); - // RemoveExtraEdges won't work if the block has an unanalyzable branch, so - // explicitly remove CvtBBI as a successor. + // Keep the CFG updated. BBI.BB->removeSuccessor(&CvtMBB, true); } else { // Predicate the 'true' block after removing its branch. CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB); PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); - // Now merge the entry of the triangle with the true block. + // Remove the branch from the entry of the triangle to NextBB to be able to + // do the merge below. 
Keep the CFG updated, but remember we removed the
+    // branch since we do want to execute NextMBB, either by introducing a
+    // branch to it again, or merging it into the entry block.
+    // How it's handled is decided further down.
     BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+    BBI.BB->removeSuccessor(&NextMBB, true);
+    RemovedBranchToNextMBB = true;
+
+    // Now merge the entry of the triangle with the true block.
     MergeBlocks(BBI, *CvtBBI, false);
   }
@@ -1640,12 +1651,19 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
     // block. By not merging them, we make it possible to iteratively
     // ifcvt the blocks.
     if (!HasEarlyExit &&
-        NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough &&
+        // We might have removed BBI from NextMBB's predecessor list above but
+        // we want it to be there, so consider that too.
+        (NextMBB.pred_size() == (RemovedBranchToNextMBB ? 0 : 1)) &&
+        !NextBBI->HasFallThrough &&
         !NextMBB.hasAddressTaken()) {
+      // We will merge NextBBI into BBI, and thus remove the current
+      // fallthrough from BBI into CvtBBI.
+      BBI.BB->removeSuccessor(&CvtMBB, true);
       MergeBlocks(BBI, *NextBBI);
       FalseBBDead = true;
     } else {
       InsertUncondBranch(*BBI.BB, NextMBB, TII);
+      BBI.BB->addSuccessor(&NextMBB);
       BBI.HasFallThrough = false;
     }
     // Mixed predicated and unpredicated code. This cannot be iteratively
@@ -1653,8 +1671,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
       IterIfcvt = false;
   }
 
-  RemoveExtraEdges(BBI);
-
   // Update block info. BB can be iteratively if-converted.
   if (!IterIfcvt)
     BBI.IsDone = true;
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 000000000000..00182e2c779f
--- /dev/null
+++ b/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,211 @@
+//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+///===---------------------------------------------------------------------===//
+///
+/// \file
+/// This pass moves instructions close to the definitions of their operands to
+/// shrink the live ranges of those definitions. The code motion is limited to
+/// within the basic block. The moved instruction should have one def and more
+/// than one use operand, where each such use is the only use of its defining
+/// instruction.
+///
+///===---------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+          "Number of instructions hoisted to shrink live range.");
+
+using namespace llvm;
+
+namespace {
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+  static char ID;
+
+  LiveRangeShrink() : MachineFunctionPass(ID) {
+    initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Live Range Shrink"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
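Before the helper definitions, a hypothetical before/after may make the intended motion concrete (pseudo machine code; register names invented):

// Before:                      After hoisting:
//   %a = LOAD ...                %a = LOAD ...
//   %b = LOAD ...                %b = LOAD ...
//   <unrelated instructions>     %c = ADD %a, %b   ; moved next to %b's def
//   %c = ADD %a, %b              <unrelated instructions>
//
// The ADD is the only user of %a and %b, so moving it up ends both live
// ranges immediately and lowers register pressure across the gap.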
+
+char LiveRangeShrink::ID = 0;
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+                false)
+namespace {
+typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
+
+/// Returns \p New if it's dominated by \p Old, otherwise return \p Old.
+/// \p M maintains a map from instruction to its dominating order, such that
+/// M[A] > M[B] guarantees that A is dominated by B.
+/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return
+/// \p New.
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
+                                       const InstOrderMap &M) {
+  auto NewIter = M.find(&New);
+  if (NewIter == M.end())
+    return Old;
+  if (Old == nullptr)
+    return &New;
+  unsigned OrderOld = M.find(Old)->second;
+  unsigned OrderNew = NewIter->second;
+  if (OrderOld != OrderNew)
+    return OrderOld < OrderNew ? &New : Old;
+  // OrderOld == OrderNew, so we need to iterate down from Old to see if it
+  // can reach New; if so, New is dominated by Old.
+  for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+       I = I->getNextNode())
+    if (I == &New)
+      return &New;
+  return Old;
+}
+
+/// Builds the instruction-to-dominating-order-number map \p M by traversing
+/// the block from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
+  M.clear();
+  unsigned i = 0;
+  for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+    M[&I] = i++;
+}
+} // end anonymous namespace
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+  InstOrderMap IOM;
+  // Map from register to the instruction order (value of IOM) at which the
+  // register is last used. When moving an instruction up, we need to make
+  // sure all its defs (including dead defs) do not cross their last uses.
+  DenseMap<unsigned, unsigned> UseMap;
+
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBB.empty())
+      continue;
+    bool SawStore = false;
+    BuildInstOrderMap(MBB.begin(), IOM);
+    UseMap.clear();
+
+    for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+      MachineInstr &MI = *Next;
+      ++Next;
+      if (MI.isPHI() || MI.isDebugValue())
+        continue;
+      if (MI.mayStore())
+        SawStore = true;
+
+      unsigned CurrentOrder = IOM[&MI];
+      unsigned Barrier = 0;
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || MO.isDebug())
+          continue;
+        if (MO.isUse())
+          UseMap[MO.getReg()] = CurrentOrder;
+        else if (MO.isDead() && UseMap.count(MO.getReg()))
+          // Barrier is the last instruction where MO gets used. MI should not
+          // be moved above Barrier.
+          Barrier = std::max(Barrier, UseMap[MO.getReg()]);
+      }
+
+      if (!MI.isSafeToMove(nullptr, SawStore)) {
+        // If MI has side effects, it should become a barrier for code motion.
+        // IOM is rebuilt from the next instruction to prevent later
+        // instructions from being moved before this MI.
+        if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+          BuildInstOrderMap(Next, IOM);
+          SawStore = false;
+        }
+        continue;
+      }
+
+      const MachineOperand *DefMO = nullptr;
+      MachineInstr *Insert = nullptr;
+
+      // Number of live-ranges that will be shortened. We do not count
+      // live-ranges that are defined by a COPY, as those could be coalesced
+      // later.
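Concretely, under this counting rule (hypothetical registers):

// %c = ADD %a, %b, where %a and %b each have one def and this ADD as their
// only use: NumEligibleUse == 2, so hoisting %c is considered worthwhile.
// If %b came from a COPY it would not be counted, since register coalescing
// may remove the COPY and its live range anyway.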
+      unsigned NumEligibleUse = 0;
+
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || MO.isDead() || MO.isDebug())
+          continue;
+        unsigned Reg = MO.getReg();
+        // Do not move the instruction if it def/uses a physical register,
+        // unless it is a constant physical register.
+        if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+            !MRI.isConstantPhysReg(Reg)) {
+          Insert = nullptr;
+          break;
+        }
+        if (MO.isDef()) {
+          // Do not move if there is more than one def.
+          if (DefMO) {
+            Insert = nullptr;
+            break;
+          }
+          DefMO = &MO;
+        } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) {
+          MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
+          if (!DefInstr.isCopy())
+            NumEligibleUse++;
+          Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
+        } else {
+          Insert = nullptr;
+          break;
+        }
+      }
+      // Move the instruction when # of shrunk live ranges > 1.
+      if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
+        MachineBasicBlock::iterator I = std::next(Insert->getIterator());
+        // Skip all the PHI and debug instructions.
+        while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
+          I = std::next(I);
+        if (I == MI.getIterator())
+          continue;
+
+        // Update the dominator order to be the same as the insertion point.
+        // We do this to maintain a non-decreasing order without needing to
+        // update all instruction orders after the insertion point.
+        unsigned NewOrder = IOM[&*I];
+        IOM[&MI] = NewOrder;
+        NumInstrsHoistedToShrinkLiveRange++;
+
+        // Find MI's debug values following MI.
+        MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
+        if (MI.getOperand(0).isReg())
+          for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
+                 EndIter->getOperand(0).isReg() &&
+                 EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
+               ++EndIter, ++Next)
+            IOM[&*EndIter] = NewOrder;
+        MBB.splice(I, &MBB, MI.getIterator(), EndIter);
+      }
+    }
+  }
+  return false;
+}
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 3568b0294ad9..a9aec926115a 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
                                 MachineBasicBlock *SuccBB) {
   const unsigned NumNew = BB->getNumber();
 
-  SmallSet<unsigned, 4> Defs, Kills;
+  DenseSet<unsigned> Defs, Kills;
 
   MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
   for (; BBI != BBE && BBI->isPHI(); ++BBI) {
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 4cfc128a8c1d..5003115a770f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacementThreshold(
                   "that won't conflict."),
     cl::init(2), cl::Hidden);
 
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+    "tail-dup-placement-aggressive-threshold",
+    cl::desc("Instruction cutoff for aggressive tail duplication during "
+             "layout. Used at -O3. Tail merging during layout is forced to "
+             "have a threshold that won't conflict."), cl::init(3),
+    cl::Hidden);
+
 // Heuristic for tail duplication.
 static cl::opt<unsigned> TailDupPlacementPenalty(
     "tail-dup-placement-penalty",
@@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   assert(BlockToChain.empty());
   assert(ComputedEdges.empty());
 
+  unsigned TailDupSize = TailDupPlacementThreshold;
+  // If only the aggressive threshold is explicitly set, use it.
+  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+      TailDupPlacementThreshold.getNumOccurrences() == 0)
+    TailDupSize = TailDupPlacementAggressiveThreshold;
+
+  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  // For aggressive optimization, we can adjust some thresholds to be less
+  // conservative.
+  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+    // At O3 we should be more willing to copy blocks for tail duplication.
+    // This increases size pressure, so we only do it at O3.
+    // Do this unless only the regular threshold is explicitly set.
+    if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+        TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+      TailDupSize = TailDupPlacementAggressiveThreshold;
+  }
+
   if (TailDupPlacement) {
     MPDT = &getAnalysis<MachinePostDominatorTree>();
-    unsigned TailDupSize = TailDupPlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
-  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
   // TailMerge can create jump into if branches that make CFG irreducible for
   // HW that requires structured CFG.
   bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
@@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
                          BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
+    unsigned TailMergeSize = TailDupSize + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index bfb2cde030dc..ab433273b189 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -2063,12 +2063,12 @@ void MachineVerifier::verifyStackFrame() {
     if (I.getOpcode() == FrameSetupOpcode) {
       if (BBState.ExitIsSetup)
         report("FrameSetup is after another FrameSetup", &I);
-      BBState.ExitValue -= TII->getFrameSize(I);
+      BBState.ExitValue -= TII->getFrameTotalSize(I);
       BBState.ExitIsSetup = true;
     }
     if (I.getOpcode() == FrameDestroyOpcode) {
-      int Size = TII->getFrameSize(I);
+      int Size = TII->getFrameTotalSize(I);
       if (!BBState.ExitIsSetup)
         report("FrameDestroy is not after a FrameSetup", &I);
      int AbsSPAdj = BBState.ExitValue < 0 ?
-BBState.ExitValue : diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index c67a25b888bf..db2264b2439d 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -34,7 +34,7 @@ #include using namespace llvm; -#define DEBUG_TYPE "phielim" +#define DEBUG_TYPE "phi-node-elimination" static cl::opt DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false), diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index bf44ee8453b6..1803ea2b9249 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -3214,7 +3214,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) { CurrList(WorkList.begin() + PrevSize, WorkList.end()); if (copyCoalesceWorkList(CurrList)) WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(), - (MachineInstr*)nullptr), WorkList.end()); + nullptr), WorkList.end()); } void RegisterCoalescer::coalesceLocals() { diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 35db30f89976..0635e5c0a63c 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -62,10 +62,9 @@ void RegScavenger::init(MachineBasicBlock &MBB) { } this->MBB = &MBB; - for (SmallVectorImpl::iterator I = Scavenged.begin(), - IE = Scavenged.end(); I != IE; ++I) { - I->Reg = 0; - I->Restore = nullptr; + for (ScavengedInfo &SI : Scavenged) { + SI.Reg = 0; + SI.Restore = nullptr; } Tracking = false; diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 7fa379d80c6c..08b3d345f689 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -19,6 +19,7 @@ #include "SafeStackLayout.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -92,11 +93,11 @@ public: /// determined statically), and the unsafe stack, which contains all /// local variables that are accessed in ways that we can't prove to /// be safe. -class SafeStack : public FunctionPass { - const TargetMachine *TM; - const TargetLoweringBase *TL; - const DataLayout *DL; - ScalarEvolution *SE; +class SafeStack { + Function &F; + const TargetLoweringBase &TL; + const DataLayout &DL; + ScalarEvolution &SE; Type *StackPtrTy; Type *IntPtrTy; @@ -171,33 +172,21 @@ class SafeStack : public FunctionPass { uint64_t AllocaSize); public: - static char ID; // Pass identification, replacement for typeid. 
- SafeStack(const TargetMachine *TM) - : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) { - initializeSafeStackPass(*PassRegistry::getPassRegistry()); - } - SafeStack() : SafeStack(nullptr) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } - - bool doInitialization(Module &M) override { - DL = &M.getDataLayout(); - - StackPtrTy = Type::getInt8PtrTy(M.getContext()); - IntPtrTy = DL->getIntPtrType(M.getContext()); - Int32Ty = Type::getInt32Ty(M.getContext()); - Int8Ty = Type::getInt8Ty(M.getContext()); - - return false; - } - - bool runOnFunction(Function &F) override; -}; // class SafeStack + SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL, + ScalarEvolution &SE) + : F(F), TL(TL), DL(DL), SE(SE), + StackPtrTy(Type::getInt8PtrTy(F.getContext())), + IntPtrTy(DL.getIntPtrType(F.getContext())), + Int32Ty(Type::getInt32Ty(F.getContext())), + Int8Ty(Type::getInt8Ty(F.getContext())) {} + + // Run the transformation on the associated function. + // Returns whether the function was changed. + bool run(); +}; uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { - uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); if (AI->isArrayAllocation()) { auto C = dyn_cast(AI->getArraySize()); if (!C) @@ -209,11 +198,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, const Value *AllocaPtr, uint64_t AllocaSize) { - AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); - const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); + AllocaOffsetRewriter Rewriter(SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr)); - uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); - ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); + uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType()); + ConstantRange AccessStartRange = SE.getUnsignedRange(Expr); ConstantRange SizeRange = ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); ConstantRange AccessRange = AccessStartRange.add(SizeRange); @@ -226,8 +215,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, << *AllocaPtr << "\n" << " Access " << *Addr << "\n" << " SCEV " << *Expr - << " U: " << SE->getUnsignedRange(Expr) - << ", S: " << SE->getSignedRange(Expr) << "\n" + << " U: " << SE.getUnsignedRange(Expr) + << ", S: " << SE.getSignedRange(Expr) << "\n" << " Range " << AccessRange << "\n" << " AllocaRange " << AllocaRange << "\n" << " " << (Safe ? 
"safe" : "unsafe") << "\n"); @@ -266,7 +255,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { switch (I->getOpcode()) { case Instruction::Load: { - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr, AllocaSize)) return false; break; @@ -282,7 +271,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { return false; } - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()), AllocaPtr, AllocaSize)) return false; break; @@ -343,7 +332,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { } Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { - Value *StackGuardVar = TL->getIRStackGuard(IRB); + Value *StackGuardVar = TL.getIRStackGuard(IRB); if (!StackGuardVar) StackGuardVar = F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy); @@ -390,7 +379,7 @@ void SafeStack::findInsts(Function &F, if (!Arg.hasByValAttr()) continue; uint64_t Size = - DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + DL.getTypeStoreSize(Arg.getType()->getPointerElementType()); if (IsSafeStackAlloca(&Arg, Size)) continue; @@ -476,19 +465,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( if (StackGuardSlot) { Type *Ty = StackGuardSlot->getAllocatedType(); unsigned Align = - std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); + std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), Align, SSC.getFullLiveRange()); } for (Argument *Arg : ByValArguments) { Type *Ty = Arg->getType()->getPointerElementType(); - uint64_t Size = DL->getTypeStoreSize(Ty); + uint64_t Size = DL.getTypeStoreSize(Ty); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. // Ensure the object is properly aligned. - unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty), Arg->getParamAlignment()); SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange()); } @@ -501,7 +490,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( // Ensure the object is properly aligned. unsigned Align = - std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()); + std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()); SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI)); } @@ -539,7 +528,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( unsigned Offset = SSL.getObjectOffset(Arg); Type *Ty = Arg->getType()->getPointerElementType(); - uint64_t Size = DL->getTypeStoreSize(Ty); + uint64_t Size = DL.getTypeStoreSize(Ty); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. @@ -630,7 +619,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false); Type *Ty = AI->getAllocatedType(); - uint64_t TySize = DL->getTypeAllocSize(Ty); + uint64_t TySize = DL.getTypeAllocSize(Ty); Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize)); Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy); @@ -638,7 +627,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( // Align the SP value to satisfy the AllocaInst, type and stack alignments. 
   unsigned Align = std::max(
-      std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+      std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
       (unsigned)StackAlignment);

   assert(isPowerOf2_32(Align));
@@ -685,25 +674,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
   }
 }

-bool SafeStack::runOnFunction(Function &F) {
-  DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
-
-  if (!F.hasFnAttribute(Attribute::SafeStack)) {
-    DEBUG(dbgs() << "[SafeStack]     safestack is not requested"
-                    " for this function\n");
-    return false;
-  }
-
-  if (F.isDeclaration()) {
-    DEBUG(dbgs() << "[SafeStack]     function definition"
-                    " is not available\n");
-    return false;
-  }
-
-  if (!TM)
-    report_fatal_error("Target machine is required");
-  TL = TM->getSubtargetImpl(F)->getTargetLowering();
-  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+bool SafeStack::run() {
+  assert(F.hasFnAttribute(Attribute::SafeStack) &&
+         "Can't run SafeStack on a function without the attribute");
+  assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");

   ++NumFunctions;

@@ -736,7 +710,7 @@ bool SafeStack::runOnFunction(Function &F) {
     ++NumUnsafeStackRestorePointsFunctions;

   IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
-  UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB);
+  UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);

   // Load the current stack pointer (we'll also use it as a base pointer).
   // FIXME: use a dedicated register for it ?
@@ -788,14 +762,70 @@ bool SafeStack::runOnFunction(Function &F) {
   return true;
 }

+class SafeStackLegacyPass : public FunctionPass {
+  const TargetMachine *TM;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  SafeStackLegacyPass(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {
+    initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  SafeStackLegacyPass() : SafeStackLegacyPass(nullptr) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+    if (!F.hasFnAttribute(Attribute::SafeStack)) {
+      DEBUG(dbgs() << "[SafeStack]     safestack is not requested"
+                      " for this function\n");
+      return false;
+    }
+
+    if (F.isDeclaration()) {
+      DEBUG(dbgs() << "[SafeStack]     function definition"
+                      " is not available\n");
+      return false;
+    }
+
+    if (!TM)
+      report_fatal_error("Target machine is required");
+    auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+    if (!TL)
+      report_fatal_error("TargetLowering instance is required");
+
+    auto *DL = &F.getParent()->getDataLayout();
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+    // Compute DT and LI only for functions that have the attribute.
+    // This is only useful because the legacy pass manager doesn't let us
+    // compute analyses lazily.
+    // In the backend pipeline, nothing preserves DT before SafeStack, so we
+    // would otherwise always compute it wastefully, even if there is no
+    // function with the safestack attribute.
+    DominatorTree DT(F);
+    LoopInfo LI(DT);
+
+    ScalarEvolution SE(F, TLI, ACT, DT, LI);
+
+    return SafeStack(F, *TL, *DL, SE).run();
+  }
+};
+
 } // anonymous namespace

-char SafeStack::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
+char SafeStackLegacyPass::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(SafeStackLegacyPass, "safe-stack",
                          "Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
+INITIALIZE_TM_PASS_END(SafeStackLegacyPass, "safe-stack",
                        "Safe Stack instrumentation pass", false, false)

 FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
-  return new SafeStack(TM);
+  return new SafeStackLegacyPass(TM);
 }
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 000000000000..dab5b91f50ad
--- /dev/null
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,660 @@
+//=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===//
+//=== intrinsics ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics - when unsupported by the target
+// - with a chain of basic blocks that deal with the elements one-by-one if the
+// appropriate mask bit is set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrin : public FunctionPass {
+  const TargetTransformInfo *TTI;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
+    initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "Scalarize Masked Memory Intrinsics";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+private:
+  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+};
+} // namespace
+
+char ScalarizeMaskedMemIntrin::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+                      "Scalarize unsupported masked memory intrinsics", false,
+                      false)
+INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+                    "Scalarize unsupported masked memory intrinsics", false,
+                    false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
+  return new ScalarizeMaskedMemIntrin();
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+//                                <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, with loading element one-by-one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+// cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+// else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+// else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Alignment = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  Value *Src0 = CI->getArgOperand(3);
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+
+  Type *EltTy = CI->getType()->getVectorElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Short-cut if the mask is all-true.
+  bool IsAllOnesMask =
+      isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+  if (IsAllOnesMask) {
+    Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  // Adjust alignment for the scalar instruction.
+  AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  unsigned VectorWidth = VecType->getNumElements();
+
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  if (isa<ConstantVector>(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *Gep =
+          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+      LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+      VResult =
+          Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+    }
+    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    // %to_load = icmp eq i1 %mask_1, true
+    // br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    // %EltAddr = getelementptr i32* %1, i32 0
+    // %Elt = load i32* %EltAddr
+    // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+//                         <16 x i1> %mask)
+// to a chain of basic blocks, that stores element one-by-one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptr = CI->getArgOperand(1);
+  Value *Alignment = CI->getArgOperand(2);
+  Value *Mask = CI->getArgOperand(3);
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Short-cut if the mask is all-true.
+  bool IsAllOnesMask =
+      isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+  if (IsAllOnesMask) {
+    Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+    CI->eraseFromParent();
+    return;
+  }
+
+  // Adjust alignment for the scalar instruction.
+  AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  unsigned VectorWidth = VecType->getNumElements();
+
+  if (isa<ConstantVector>(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+      Value *Gep =
+          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+      Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    // %to_store = icmp eq i1 %mask_1, true
+    // br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    // %EltAddr = getelementptr i32* %1, i32 0
+    // %store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock =
+        IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
+//                                         <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, with loading element one-by-one if
+// the appropriate mask bit is set
+//
+// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// % Mask0 = extractelement <16 x i1> %Mask, i32 0
+// % ToLoad0 = icmp eq i1 % Mask0, true
+// br i1 % ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// % Load0 = load i32, i32* % Ptr0, align 4
+// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
+// % Mask1 = extractelement <16 x i1> %Mask, i32 1
+// % ToLoad1 = icmp eq i1 % Mask1, true
+// br i1 % ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// % Load1 = load i32, i32* % Ptr1, align 4
+// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
+// br label %else2
+// . . .
+// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src +// ret <16 x i32> %Result +static void scalarizeMaskedGather(CallInst *CI) { + Value *Ptrs = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast(CI->getType()); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + unsigned AlignVal = cast(Alignment)->getZExtValue(); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + unsigned VectorWidth = VecType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + bool IsConstMask = isa(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = + Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement( + VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %ToLoad1 = icmp eq i1 %Mask1, true + // br i1 %ToLoad1, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToLoad" + Twine(Idx)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = + Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), + "Res" + Twine(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// Translate a masked scatter 
intrinsic, like +// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, +// <16 x i1> %Mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set. +// +// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> % Mask, i32 0 +// % ToStore0 = icmp eq i1 % Mask0, true +// br i1 %ToStore0, label %cond.store, label %else +// +// cond.store: +// % Elt0 = extractelement <16 x i32> %Src, i32 0 +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// store i32 %Elt0, i32* % Ptr0, align 4 +// br label %else +// +// else: +// % Mask1 = extractelement <16 x i1> % Mask, i32 1 +// % ToStore1 = icmp eq i1 % Mask1, true +// br i1 % ToStore1, label %cond.store1, label %else2 +// +// cond.store1: +// % Elt1 = extractelement <16 x i32> %Src, i32 1 +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// store i32 % Elt1, i32* % Ptr1, align 4 +// br label %else2 +// . . . +static void scalarizeMaskedScatter(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptrs = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + assert(isa(Src->getType()) && + "Unexpected data type in masked scatter intrinsic"); + assert(isa(Ptrs->getType()) && + isa(Ptrs->getType()->getVectorElementType()) && + "Vector of pointers is expected in masked scatter intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + unsigned AlignVal = cast(Alignment)->getZExtValue(); + unsigned VectorWidth = Src->getType()->getVectorNumElements(); + + // Shorten the way if the mask is a vector of constants. 
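For readers tracing these four expansions, the lane-by-lane semantics being materialized can be stated compactly in scalar form. The following is an illustrative plain-C++ model (the names and fixed-array signatures are invented for the sketch), not code from the pass:

#include <cstddef>

// Scalar model of the masked-intrinsic semantics: a lane participates only
// if its mask bit is set. Masked-off lanes of a load keep the pass-through
// value; masked-off lanes of a scatter touch no memory at all.
template <typename T, std::size_t N>
void maskedLoadModel(T (&Dst)[N], const T *Ptr, const bool (&Mask)[N],
                     const T (&PassThru)[N]) {
  for (std::size_t I = 0; I != N; ++I)
    Dst[I] = Mask[I] ? Ptr[I]       // the "cond.load" block for lane I
                     : PassThru[I]; // lane keeps the pass-through value
}

template <typename T, std::size_t N>
void maskedScatterModel(const T (&Src)[N], T *const (&Ptrs)[N],
                        const bool (&Mask)[N]) {
  for (std::size_t I = 0; I != N; ++I)
    if (Mask[I])
      *Ptrs[I] = Src[I];            // the "cond.store" block for lane I
}

The constant-mask fast paths below correspond to fully unrolling these loops and dropping every iteration whose mask bit is a known zero.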
+ bool IsConstMask = isa(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + // Fill the "else" block, created in the previous iteration + // + // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx + // % ToStore = icmp eq i1 % Mask1, true + // br i1 % ToStore, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToStore" + Twine(Idx)); + + // Create "cond" block + // + // % Elt1 = extractelement <16 x i32> %Src, i32 1 + // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 + // %store i32 % Elt1, i32* % Ptr1 + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + bool EverMadeChange = false; + + TTI = &getAnalysis().getTTI(F); + + bool MadeChange = true; + while (MadeChange) { + MadeChange = false; + for (Function::iterator I = F.begin(); I != F.end();) { + BasicBlock *BB = &*I++; + bool ModifiedDTOnIteration = false; + MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); + + // Restart BB iteration if the dominator tree of the Function was changed + if (ModifiedDTOnIteration) + break; + } + + EverMadeChange |= MadeChange; + } + + return EverMadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { + bool MadeChange = false; + + BasicBlock::iterator CurInstIterator = BB.begin(); + while (CurInstIterator != BB.end()) { + if (CallInst *CI = dyn_cast(&*CurInstIterator++)) + MadeChange |= optimizeCallInst(CI, ModifiedDT); + if (ModifiedDT) + return true; + } + + return MadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI, + bool &ModifiedDT) { + + IntrinsicInst *II = dyn_cast(CI); + if (II) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::masked_load: { + // Scalarize unsupported vector masked load + if (!TTI->isLegalMaskedLoad(CI->getType())) { + scalarizeMaskedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_store: { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { + scalarizeMaskedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_gather: { + if (!TTI->isLegalMaskedGather(CI->getType())) { + 
scalarizeMaskedGather(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_scatter: { + if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { + scalarizeMaskedScatter(CI); + ModifiedDT = true; + return true; + } + return false; + } + } + } + + return false; +} diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c77046fdfaf5..caf5cb497a71 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -114,7 +114,7 @@ namespace { SmallPtrSet CombinedNodes; // AA - Used for DAG load/store alias analysis. - AliasAnalysis &AA; + AliasAnalysis *AA; /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. @@ -496,9 +496,9 @@ namespace { SDValue distributeTruncateThroughAnd(SDNode *N); public: - DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) + DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), - OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { + OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) { ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); MaximumLegalStoreInBits = 0; @@ -1729,10 +1729,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { NumLeftToConsider--; } - SDValue Result; - // If we've changed things around then replace token factor. if (Changed) { + SDValue Result; if (Ops.empty()) { // The entry token is the only possible outcome. Result = DAG.getEntryNode(); @@ -1749,13 +1748,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); } } - - // Add users to worklist, since we may introduce a lot of new - // chained token factors while removing memory deps. - return CombineTo(N, Result, true /*add to worklist*/); + return Result; } - - return Result; + return SDValue(); } /// MERGE_VALUES can always be eliminated. 
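The ADDCARRY hunk that follows hoists the SDLoc and applies two rewrites: constants are canonicalized to the right-hand side, and a carry-in of false turns ADDCARRY into UADDO. A small scalar model of why the second rewrite is sound (an illustrative sketch at an arbitrary 32-bit width, not DAG code):

#include <cstdint>
#include <utility>

// Returns {sum, carry-out}. For unsigned wrap-around addition, carry-out
// is exactly the overflow condition Sum < X.
static std::pair<uint32_t, bool> uaddo(uint32_t X, uint32_t Y) {
  uint32_t Sum = X + Y; // wraps modulo 2^32
  return {Sum, Sum < X};
}

// Add-with-carry chains two overflowing adds; at most one of the two can
// actually carry, so OR-ing the flags is exact.
static std::pair<uint32_t, bool> addcarry(uint32_t X, uint32_t Y, bool CIn) {
  auto [S1, C1] = uaddo(X, Y);
  auto [S2, C2] = uaddo(S1, CIn ? 1u : 0u);
  return {S2, C1 || C2};
}
// With CIn == false the second add is the identity, so
// addcarry(X, Y, false) == uaddo(X, Y) for all X and Y.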
@@ -2131,17 +2126,17 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); + SDLoc DL(N); // canonicalize constant to RHS ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) - return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), - N1, N0, CarryIn); + return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); // fold (addcarry x, y, false) -> (uaddo x, y) if (isNullConstant(CarryIn)) - return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N0, N1); + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N)) return Combined; @@ -5313,17 +5308,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } } - // If the target supports masking y in (shl, y), - // fold (shl x, (and y, ((1 << numbits(x)) - 1))) -> (shl x, y) - if (TLI.isOperationLegal(ISD::SHL, VT) && - TLI.supportsModuloShift(ISD::SHL, VT) && N1->getOpcode() == ISD::AND) { - if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) { - if (Mask->getZExtValue() == OpSizeInBits - 1) { - return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1->getOperand(0)); - } - } - } - ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (shl c1, c2) -> c1<isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); // fold (shl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (shl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -5522,18 +5506,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); - // If the target supports masking y in (sra, y), - // fold (sra x, (and y, ((1 << numbits(x)) - 1))) -> (sra x, y) - if (TLI.isOperationLegal(ISD::SRA, VT) && - TLI.supportsModuloShift(ISD::SRA, VT) && N1->getOpcode() == ISD::AND) { - if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) { - if (Mask->getZExtValue() == OpSizeInBits - 1) { - return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, N1->getOperand(0)); - } - } - } - // Arithmetic shifting an all-sign-bit value is a no-op. 
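The comment kept above ("arithmetic shifting an all-sign-bit value is a no-op") is what the ComputeNumSignBits(N0) == OpSizeInBits early-out implements, and after this change it also subsumes the explicit sra 0 and sra -1 folds moved up beside it. A tiny self-check of the identity (illustrative sketch; it assumes the usual arithmetic right shift of signed integers that mainstream compilers provide):

#include <cassert>
#include <cstdint>

// A value whose bits all equal its sign bit is 0 or -1, and an arithmetic
// right shift by any in-range amount reproduces it unchanged.
static void checkSraIsNoOp(int64_t X) {
  assert((X == 0 || X == -1) && "expected an all-sign-bits value");
  for (unsigned C = 0; C < 64; ++C)
    assert((X >> C) == X && "sra must act as the identity here");
}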
+ // fold (sra 0, x) -> 0 + // fold (sra -1, x) -> -1 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits) return N0; @@ -5548,12 +5523,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); - // fold (sra 0, x) -> 0 - if (isNullConstant(N0)) - return N0; - // fold (sra -1, x) -> -1 - if (isAllOnesConstant(N0)) - return N0; // fold (sra x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) return DAG.getUNDEF(VT); @@ -5691,17 +5660,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); - // If the target supports masking y in (srl, y), - // fold (srl x, (and y, ((1 << numbits(x)) - 1))) -> (srl x, y) - if (TLI.isOperationLegal(ISD::SRL, VT) && - TLI.supportsModuloShift(ISD::SRL, VT) && N1->getOpcode() == ISD::AND) { - if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) { - if (Mask->getZExtValue() == OpSizeInBits - 1) { - return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1->getOperand(0)); - } - } - } - // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) @@ -5714,7 +5672,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); // fold (srl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (srl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -7365,14 +7323,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueSizeInBits(), std::min(Op.getValueSizeInBits(), VT.getSizeInBits())); - if (TruncatedBits.isSubsetOf(Known.Zero)) { - if (VT.bitsGT(Op.getValueType())) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op); - if (VT.bitsLT(Op.getValueType())) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - - return Op; - } + if (TruncatedBits.isSubsetOf(Known.Zero)) + return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } // fold (zext (truncate (load x))) -> (zext (smaller load x)) @@ -7419,14 +7371,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { - SDValue Op = N0.getOperand(0); - if (SrcVT.bitsLT(VT)) { - Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } else if (SrcVT.bitsGT(VT)) { - Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } + SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); + AddToWorklist(Op.getNode()); return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); } } @@ -7440,11 +7386,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); - } + X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); @@ -7669,14 +7611,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { } // fold (aext (truncate x)) - if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue TruncOp = N0.getOperand(0); - if (TruncOp.getValueType() == VT) - return TruncOp; // x iff x size == zext size. 
- if (TruncOp.getValueType().bitsGT(VT)) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp); - return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp); - } + if (N0.getOpcode() == ISD::TRUNCATE) + return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. @@ -7687,11 +7623,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getValueType())) { SDLoc DL(N); SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, DL, VT, X); - } + X = DAG.getAnyExtOrTrunc(X, DL, VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, @@ -14868,6 +14800,55 @@ SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { return SDValue(); } +// Combine shuffles of splat-shuffles of the form: +// shuffle (shuffle V, undef, splat-mask), undef, M +// If splat-mask contains undef elements, we need to be careful about +// introducing undef's in the folded mask which are not the result of composing +// the masks of the shuffles. +static SDValue combineShuffleOfSplat(ArrayRef UserMask, + ShuffleVectorSDNode *Splat, + SelectionDAG &DAG) { + ArrayRef SplatMask = Splat->getMask(); + assert(UserMask.size() == SplatMask.size() && "Mask length mismatch"); + + // Prefer simplifying to the splat-shuffle, if possible. This is legal if + // every undef mask element in the splat-shuffle has a corresponding undef + // element in the user-shuffle's mask or if the composition of mask elements + // would result in undef. + // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask): + // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u] + // In this case it is not legal to simplify to the splat-shuffle because we + // may be exposing the users of the shuffle an undef element at index 1 + // which was not there before the combine. + // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u] + // In this case the composition of masks yields SplatMask, so it's ok to + // simplify to the splat-shuffle. + // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u] + // In this case the composed mask includes all undef elements of SplatMask + // and in addition sets element zero to undef. It is safe to simplify to + // the splat-shuffle. + auto CanSimplifyToExistingSplat = [](ArrayRef UserMask, + ArrayRef SplatMask) { + for (unsigned i = 0, e = UserMask.size(); i != e; ++i) + if (UserMask[i] != -1 && SplatMask[i] == -1 && + SplatMask[UserMask[i]] != -1) + return false; + return true; + }; + if (CanSimplifyToExistingSplat(UserMask, SplatMask)) + return SDValue(Splat, 0); + + // Create a new shuffle with a mask that is composed of the two shuffles' + // masks. + SmallVector NewMask; + for (int Idx : UserMask) + NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]); + + return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), + Splat->getOperand(0), Splat->getOperand(1), + NewMask); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -14914,6 +14895,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); } + // A shuffle of a single vector that is a splat can always be folded. 
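The heart of combineShuffleOfSplat above is mask composition, which the fold below wires into visitVECTOR_SHUFFLE. The step is easy to see in isolation; here is a standalone sketch (illustrative only; -1 stands for an undef mask element, as in ShuffleVectorSDNode):

#include <vector>

// Element I of (shuffle (shuffle V, undef, SplatMask), undef, UserMask)
// reads lane SplatMask[UserMask[I]] of V; an undef index anywhere in the
// chain makes the result lane undef.
static std::vector<int> composeShuffleMasks(const std::vector<int> &UserMask,
                                            const std::vector<int> &SplatMask) {
  std::vector<int> NewMask;
  NewMask.reserve(UserMask.size());
  for (int Idx : UserMask)
    NewMask.push_back(Idx < 0 ? -1 : SplatMask[Idx]);
  return NewMask;
}
// E.g. UserMask = {0, 2, -1, -1} over SplatMask = {2, -1, 2, -1} composes
// to {2, 2, -1, -1}, matching the first example in the comment above.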
+ if (auto *N0Shuf = dyn_cast(N0)) + if (N1->isUndef() && N0Shuf->isSplat()) + return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG); + // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { @@ -16381,17 +16367,17 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { UseAA = false; #endif - if (UseAA && + if (UseAA && AA && Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset; int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset; AliasResult AAResult = - AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, - UseTBAA ? Op0->getAAInfo() : AAMDNodes()), - MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, - UseTBAA ? Op1->getAAInfo() : AAMDNodes())); + AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, + UseTBAA ? Op0->getAAInfo() : AAMDNodes()), + MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, + UseTBAA ? Op1->getAAInfo() : AAMDNodes()) ); if (AAResult == NoAlias) return false; } @@ -16605,7 +16591,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { } /// This is the entry point for the file. -void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, +void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel) { /// This is the main entry point to this class. DAGCombiner(*this, AA, OptLevel).Run(Level); diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 8c98e3740f6d..5003b79974eb 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -622,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) { // have to worry about calling conventions and target-specific lowering code. // Instead we perform the call lowering right here. // - // CALLSEQ_START(0...) + // CALLSEQ_START(0, 0...) // STACKMAP(id, nbytes, ...) // CALLSEQ_END(0, 0) // @@ -1150,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { return true; } - unsigned Offset = 0; + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. + const auto *Arg = + dyn_cast(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return true; + Optional Op; - if (const auto *Arg = dyn_cast(Address)) - // Some arguments' frame index is recorded during argument lowering. - Offset = FuncInfo.getArgumentFrameIndex(Arg); - if (Offset) - Op = MachineOperand::CreateFI(Offset); - if (!Op) - if (unsigned Reg = lookUpRegForValue(Address)) - Op = MachineOperand::CreateReg(Reg, false); + if (unsigned Reg = lookUpRegForValue(Address)) + Op = MachineOperand::CreateReg(Reg, false); // If we have a VLA that has a "use" in a metadata node that's then used // here but it has no other uses, then we have a problem. 
E.g., diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index cdf4d3a8b4e5..606b8952f3c1 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF = &mf; TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - MachineModuleInfo &MMI = MF->getMMI(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); unsigned StackAlign = TFI->getStackAlignment(); @@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, if (!isa(I) || !StaticAllocaMap.count(cast(&I))) InitializeRegForValue(&I); - // Collect llvm.dbg.declare information. This is done now instead of - // during the initial isel pass through the IR so that it is done - // in a predictable order. - if (const DbgDeclareInst *DI = dyn_cast(&I)) { - assert(DI->getVariable() && "Missing variable"); - assert(DI->getDebugLoc() && "Missing location"); - if (MMI.hasDebugInfo()) { - // Don't handle byval struct arguments or VLAs, for example. - // Non-byval arguments are handled here (they refer to the stack - // temporary alloca at this point). - const Value *Address = DI->getAddress(); - if (Address) { - if (const BitCastInst *BCI = dyn_cast(Address)) - Address = BCI->getOperand(0); - if (const AllocaInst *AI = dyn_cast(Address)) { - DenseMap::iterator SI = - StaticAllocaMap.find(AI); - if (SI != StaticAllocaMap.end()) { // Check for VLAs. - int FI = SI->second; - MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(), - FI, DI->getDebugLoc()); - } - } - } - } - } - // Decide the preferred extend type for a value. PreferredExtendType[&I] = getPreferredExtendForValue(&I); } @@ -510,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A, /// If the argument does not have any assigned frame index then 0 is /// returned. int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) { - DenseMap::iterator I = - ByValArgFrameIndexMap.find(A); + auto I = ByValArgFrameIndexMap.find(A); if (I != ByValArgFrameIndexMap.end()) return I->second; DEBUG(dbgs() << "Argument does not have assigned frame index!\n"); - return 0; + return INT_MAX; } unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg( diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 2654b3ad7a62..9a47a914df91 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1493,7 +1493,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. 
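A note on the FunctionLoweringInfo change above: frame indices are signed, and fixed stack objects legitimately get zero or negative indices, so 0 cannot double as a "not found" marker; the code now returns INT_MAX for a miss, and the FastISel caller compares against it. A minimal sketch of the lookup contract (hypothetical map type and names, for illustration only):

#include <climits>
#include <map>

// Returns the recorded frame index for Key, or INT_MAX when none was
// recorded; 0 and negative values stay available as real frame indices.
static int lookupFrameIndex(const std::map<const void *, int> &FrameIndexMap,
                            const void *Key) {
  auto It = FrameIndexMap.find(Key);
  return It != FrameIndexMap.end() ? It->second : INT_MAX;
}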
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); @@ -4187,6 +4187,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { ReplacedNode(Node); break; } + case ISD::MUL: case ISD::SDIV: case ISD::SREM: case ISD::UDIV: diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index cde4331cc42d..4c3b514856b7 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -675,6 +675,7 @@ private: // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); + SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_TruncateHelper(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 97a7fab6efd0..ff0e609803d8 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND_VECTOR_INREG: Res = SplitVecOp_ExtVecInRegOp(N); break; + + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Res = SplitVecOp_VECREDUCE(N, OpNo); + break; } } @@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect); } +SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { + EVT ResVT = N->getValueType(0); + SDValue Lo, Hi; + SDLoc dl(N); + + SDValue VecOp = N->getOperand(OpNo); + EVT VecVT = VecOp.getValueType(); + assert(VecVT.isVector() && "Can only split reduce vector operand"); + GetSplitVector(VecOp, Lo, Hi); + EVT LoOpVT, HiOpVT; + std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); + + bool NoNaN = N->getFlags().hasNoNaNs(); + unsigned CombineOpc = 0; + switch (N->getOpcode()) { + case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break; + case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break; + case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break; + case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break; + case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break; + case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break; + case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break; + case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; + case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; + case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; + case ISD::VECREDUCE_FMAX: + CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN; + break; + case ISD::VECREDUCE_FMIN: + CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN; + break; + default: + llvm_unreachable("Unexpected reduce ISD node"); + } + + // Use the appropriate scalar instruction on the split subvectors before + // reducing the now partially reduced smaller vector. 
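Concretely, splitting VECREDUCE_ADD over an illegal <8 x i32> becomes an element-wise ADD of the two <4 x i32> halves followed by a reduction of the partial vector, which is what the code below emits. A scalar model of that shape (illustrative sketch; it relies on the combine operation being associative and commutative):

#include <array>
#include <numeric>

// Model of SplitVecOp_VECREDUCE for VECREDUCE_ADD on a split <8 x i32>.
static int reduceAddAfterSplit(const std::array<int, 8> &V) {
  std::array<int, 4> Partial;
  for (unsigned I = 0; I != 4; ++I)
    Partial[I] = V[I] + V[I + 4]; // CombineOpc applied to the Lo/Hi halves
  // The remaining VECREDUCE on the now half-width vector.
  return std::accumulate(Partial.begin(), Partial.end(), 0);
}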
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi); + return DAG.getNode(N->getOpcode(), dl, ResVT, Partial); +} + SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d605a1dc1c20..057badcd6b74 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2217,10 +2217,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. - unsigned TrailZ = Known.Zero.countTrailingOnes() + - Known2.Zero.countTrailingOnes(); - unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() + - Known2.Zero.countLeadingOnes(), + unsigned TrailZ = Known.countMinTrailingZeros() + + Known2.countMinTrailingZeros(); + unsigned LeadZ = std::max(Known.countMinLeadingZeros() + + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; Known.resetAll(); @@ -2233,13 +2233,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); - unsigned LeadZ = Known2.Zero.countLeadingOnes(); + unsigned LeadZ = Known2.countMinLeadingZeros(); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros(); - if (RHSUnknownLeadingOnes != BitWidth) - LeadZ = std::min(BitWidth, - LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); + unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); + if (RHSMaxLeadingZeros != BitWidth) + LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); Known.Zero.setHighBits(LeadZ); break; @@ -2359,7 +2358,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, case ISD::CTTZ_ZERO_UNDEF: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. - unsigned PossibleTZ = Known2.One.countTrailingZeros(); + unsigned PossibleTZ = Known2.countMaxTrailingZeros(); unsigned LowBits = Log2_32(PossibleTZ) + 1; Known.Zero.setBitsFrom(LowBits); break; @@ -2368,7 +2367,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, case ISD::CTLZ_ZERO_UNDEF: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. - unsigned PossibleLZ = Known2.One.countLeadingZeros(); + unsigned PossibleLZ = Known2.countMaxLeadingZeros(); unsigned LowBits = Log2_32(PossibleLZ) + 1; Known.Zero.setBitsFrom(LowBits); break; @@ -2376,7 +2375,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, case ISD::CTPOP: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we know some of the bits are zero, they can't be one. - unsigned PossibleOnes = BitWidth - Known2.Zero.countPopulation(); + unsigned PossibleOnes = Known2.countMaxPopulation(); Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1); break; } @@ -2493,13 +2492,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // going to be 0 in the result. Both addition and complement operations // preserve the low zero bits. 
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); - unsigned KnownZeroLow = Known2.Zero.countTrailingOnes(); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); if (KnownZeroLow == 0) break; computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - KnownZeroLow = std::min(KnownZeroLow, - Known2.Zero.countTrailingOnes()); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); Known.Zero.setLowBits(KnownZeroLow); break; } @@ -2526,15 +2524,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // and the other has the top 8 bits clear, we know the top 7 bits of the // output must be clear. computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); - unsigned KnownZeroHigh = Known2.Zero.countLeadingOnes(); - unsigned KnownZeroLow = Known2.Zero.countTrailingOnes(); + unsigned KnownZeroHigh = Known2.countMinLeadingZeros(); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - KnownZeroHigh = std::min(KnownZeroHigh, - Known2.Zero.countLeadingOnes()); - KnownZeroLow = std::min(KnownZeroLow, - Known2.Zero.countTrailingOnes()); + KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros()); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) { // With ADDE and ADDCARRY, a carry bit may be added in, so we can only @@ -2594,8 +2590,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - uint32_t Leaders = std::max(Known.Zero.countLeadingOnes(), - Known2.Zero.countLeadingOnes()); + uint32_t Leaders = + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); Known.resetAll(); Known.Zero.setHighBits(Leaders); break; @@ -2711,8 +2707,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // UMIN - we know that the result will have the maximum of the // known zero leading bits of the inputs. - unsigned LeadZero = Known.Zero.countLeadingOnes(); - LeadZero = std::max(LeadZero, Known2.Zero.countLeadingOnes()); + unsigned LeadZero = Known.countMinLeadingZeros(); + LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros()); Known.Zero &= Known2.Zero; Known.One &= Known2.One; @@ -2726,8 +2722,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // UMAX - we know that the result will have the maximum of the // known one leading bits of the inputs. - unsigned LeadOne = Known.One.countLeadingOnes(); - LeadOne = std::max(LeadOne, Known2.One.countLeadingOnes()); + unsigned LeadOne = Known.countMinLeadingOnes(); + LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes()); Known.Zero &= Known2.Zero; Known.One &= Known2.One; @@ -2843,8 +2839,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // Fall back to computeKnownBits to catch other known cases. 
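These hunks swap hand-rolled countLeadingOnes/countTrailingOnes arithmetic on Known.Zero and Known.One for the newer KnownBits counting helpers, and the power-of-two test just below reads naturally once the helpers are spelled out. A simplified 64-bit model (sketch; __builtin_popcountll assumes GCC or Clang):

#include <cstdint>

// Zero and One are the proven-zero and proven-one masks of a value:
// popcount(One) bits are forced to 1, and 64 - popcount(Zero) is the most
// that could possibly be 1. min == max == 1 pins down exactly one set bit,
// i.e. a power of two.
struct KnownBits64 {
  uint64_t Zero = 0, One = 0;
  unsigned countMinPopulation() const { return __builtin_popcountll(One); }
  unsigned countMaxPopulation() const {
    return 64 - __builtin_popcountll(Zero);
  }
  bool provablyPowerOfTwo() const {
    return countMaxPopulation() == 1 && countMinPopulation() == 1;
  }
};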
KnownBits Known; computeKnownBits(Val, Known); - return (Known.Zero.countPopulation() == BitWidth - 1) && - (Known.One.countPopulation() == 1); + return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1); } unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { @@ -2860,6 +2855,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarSizeInBits(); + unsigned NumElts = DemandedElts.getBitWidth(); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; @@ -2903,6 +2899,39 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, } return Tmp; + case ISD::VECTOR_SHUFFLE: { + // Collect the minimum number of sign bits that are shared by every vector + // element referenced by the shuffle. + APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); + const ShuffleVectorSDNode *SVN = cast(Op); + assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); + for (unsigned i = 0; i != NumElts; ++i) { + int M = SVN->getMaskElt(i); + if (!DemandedElts[i]) + continue; + // For UNDEF elements, we don't know anything about the common state of + // the shuffle result. + if (M < 0) + return 1; + if ((unsigned)M < NumElts) + DemandedLHS.setBit((unsigned)M % NumElts); + else + DemandedRHS.setBit((unsigned)M % NumElts); + } + Tmp = UINT_MAX; + if (!!DemandedLHS) + Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); + if (!!DemandedRHS) { + Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + // If we don't know anything, early out and try computeKnownBits fall-back. + if (Tmp == 1) + break; + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); + return Tmp; + } + case ISD::SIGN_EXTEND: case ISD::SIGN_EXTEND_VECTOR_INREG: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); @@ -3142,14 +3171,36 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1); } - case ISD::EXTRACT_SUBVECTOR: - return ComputeNumSignBits(Op.getOperand(0), Depth + 1); + case ISD::EXTRACT_SUBVECTOR: { + // If we know the element index, just demand that subvector elements, + // otherwise demand them all. + SDValue Src = Op.getOperand(0); + ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { + // Offset the demanded elts by the subvector index. + uint64_t Idx = SubIdx->getZExtValue(); + APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx); + return ComputeNumSignBits(Src, DemandedSrc, Depth + 1); + } + return ComputeNumSignBits(Src, Depth + 1); + } case ISD::CONCAT_VECTORS: - // Determine the minimum number of sign bits across all input vectors. - // Early out if the result is already 1. - Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1); - for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) - Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1)); + // Determine the minimum number of sign bits across all demanded + // elts of the input vectors. Early out if the result is already 1. 
+ Tmp = UINT_MAX; + EVT SubVectorVT = Op.getOperand(0).getValueType(); + unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); + unsigned NumSubVectors = Op.getNumOperands(); + for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) { + APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts); + DemandedSub = DemandedSub.trunc(NumSubVectorElts); + if (!DemandedSub) + continue; + Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); return Tmp; } @@ -3543,7 +3594,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); @@ -3559,8 +3610,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) - return getNode(ISD::ZERO_EXTEND, DL, VT, - Operand.getNode()->getOperand(0)); + return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); @@ -3579,13 +3629,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunx x)) -> x if (OpOpcode == ISD::TRUNCATE) { - SDValue OpOp = Operand.getNode()->getOperand(0); + SDValue OpOp = Operand.getOperand(0); if (OpOp.getValueType() == VT) return OpOp; } @@ -3601,16 +3651,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend. 
- if (Operand.getNode()->getOperand(0).getValueType().getScalarType() + if (Operand.getOperand(0).getValueType().getScalarType() .bitsLT(VT.getScalarType())) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); - if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT)) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); - return Operand.getNode()->getOperand(0); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); + if (Operand.getOperand(0).getValueType().bitsGT(VT)) + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); + return Operand.getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); @@ -3665,15 +3715,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? - return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), - Operand.getNode()->getOperand(0), - Operand.getNode()->getFlags()); + return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1), + Operand.getOperand(0), Operand.getNode()->getFlags()); if (OpOpcode == ISD::FNEG) // --X -> X - return Operand.getNode()->getOperand(0); + return Operand.getOperand(0); break; case ISD::FABS: if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) - return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::FABS, DL, VT, Operand.getOperand(0)); break; } @@ -5970,7 +6019,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); - case 1: return getNode(Opcode, DL, VT, Ops[0]); + case 1: return getNode(Opcode, DL, VT, Ops[0], Flags); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; @@ -7520,9 +7569,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); KnownBits Known(PtrWidth); - llvm::computeKnownBits(const_cast(GV), Known, - getDataLayout()); - unsigned AlignBits = Known.Zero.countTrailingOnes(); + llvm::computeKnownBits(GV, Known, getDataLayout()); + unsigned AlignBits = Known.countMinTrailingZeros(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); @@ -7621,7 +7669,7 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, return false; // FIXME: The widths are based on this node's type, but build vectors can - // truncate their operands. + // truncate their operands. SplatValue = APInt(VecWidth, 0); SplatUndef = APInt(VecWidth, 0); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 50313e2da884..57d340c41c39 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -661,7 +661,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, unsigned RegSize = RegisterVT.getSizeInBits(); unsigned NumSignBits = LOI->NumSignBits; - unsigned NumZeroBits = LOI->Known.Zero.countLeadingOnes(); + unsigned NumZeroBits = LOI->Known.countMinLeadingZeros(); if (NumZeroBits == RegSize) { // The current value is a zero. 
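The InferPtrAlignment hunk above is a direct consequence of the same machinery: a pointer whose low AlignBits bits are known zero is aligned to at least 1 << AlignBits. A minimal sketch of that conversion (alignFromKnownTrailingZeros is a hypothetical name; the clamp keeps the shift defined and matches the 31-bit cap in the diff):

  #include <algorithm>
  #include <cstdint>

  uint64_t alignFromKnownTrailingZeros(unsigned AlignBits) {
    // Zero known-zero bits means no alignment could be proven.
    return AlignBits ? uint64_t(1) << std::min(31u, AlignBits) : 0;
  }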
@@ -811,9 +811,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, } } -void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, +void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, const TargetLibraryInfo *li) { - AA = &aa; + AA = aa; GFI = gfi; LibInfo = li; DL = &DAG.getDataLayout(); @@ -3423,7 +3423,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); - else if (AA->pointsToConstantMemory(MemoryLocation( + else if (AA && AA->pointsToConstantMemory(MemoryLocation( SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); @@ -3535,8 +3535,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { Type *Ty = I.getType(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); - assert(!AA->pointsToConstantMemory(MemoryLocation( - SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) && + assert((!AA || !AA->pointsToConstantMemory(MemoryLocation( + SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) && "load_from_swift_error should not be constant memory"); SmallVector ValueVTs; @@ -3817,7 +3817,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); // Do not serialize masked loads of constant memory with anything. - bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation( + bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation( PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo)); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); @@ -3861,7 +3861,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && - AA->pointsToConstantMemory(MemoryLocation( + AA && AA->pointsToConstantMemory(MemoryLocation( BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. @@ -4676,7 +4676,8 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( bool IsIndirect = false; Optional Op; // Some arguments' frame index is recorded during argument lowering. - if (int FI = FuncInfo.getArgumentFrameIndex(Arg)) + int FI = FuncInfo.getArgumentFrameIndex(Arg); + if (FI != INT_MAX) Op = MachineOperand::CreateFI(FI); if (!Op && N.getNode()) { @@ -4927,6 +4928,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. + const auto *Arg = + dyn_cast(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return nullptr; + SDValue &N = NodeMap[Address]; if (!N.getNode() && isa(Address)) // Check unused arguments map. @@ -4957,20 +4965,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // virtual register info from the FuncInfo.ValueMap. if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N)) { - // If variable is pinned by a alloca in dominating bb then - // use StaticAllocaMap. 
- if (const AllocaInst *AI = dyn_cast(Address)) { - if (AI->getParent() != DI.getParent()) { - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI != FuncInfo.StaticAllocaMap.end()) { - SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second, - 0, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, nullptr, false); - return nullptr; - } - } - } DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } @@ -5651,7 +5645,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { int FI = FuncInfo.StaticAllocaMap[Slot]; MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(MF.getName()), Idx); + GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl, TII->get(TargetOpcode::LOCAL_ESCAPE)) .addSym(FrameAllocSym) @@ -5672,7 +5666,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX)); MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(Fn->getName()), IdxVal); + GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal); // Create a MCSymbol for the label to avoid any target lowering // that would make this PC relative. @@ -5737,6 +5731,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::experimental_deoptimize: LowerDeoptimizeCall(&I); return nullptr; + + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: { + visitVectorReduce(I, Intrinsic); + return nullptr; + } + } } @@ -5982,7 +5994,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, bool ConstantMemory = false; // Do not serialize (non-volatile) loads of constant memory with anything. - if (Builder.AA->pointsToConstantMemory(PtrVal)) { + if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) { Root = Builder.DAG.getEntryNode(); ConstantMemory = true; } else { @@ -7422,11 +7434,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { // have to worry about calling conventions and target specific lowering code. // Instead we perform the call lowering right here. // - // chain, flag = CALLSEQ_START(chain, 0) + // chain, flag = CALLSEQ_START(chain, 0, 0) // chain, flag = STACKMAP(id, nbytes, ..., chain, flag) // chain, flag = CALLSEQ_END(chain, 0, 0, flag) // - Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL); + Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL); InFlag = Chain.getValue(1); // Add the and constants. 
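The intrinsics dispatched above are expanded by visitVectorReduce in the next hunk. For the integer forms the semantics are a plain fold over the lanes; a scalar sketch of what vecreduce_add computes (illustrative only; vecreduceAdd is a hypothetical name):

  #include <cstddef>
  #include <cstdint>

  int32_t vecreduceAdd(const int32_t *Lanes, size_t N) {
    int32_t Acc = 0;
    for (size_t I = 0; I != N; ++I)
      Acc += Lanes[I]; // integer add is associative, so lane order is free
    return Acc;
  }

The fadd/fmul forms differ: they carry an explicit start operand, and as the hunk shows they only use the reassociating VECREDUCE_FADD/FMUL nodes under unsafe-algebra fast math; otherwise the strict, in-order variants are emitted.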
@@ -7616,6 +7628,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, FuncInfo.MF->getFrameInfo().setHasPatchPoint(); } +void SelectionDAGBuilder::visitVectorReduce(const CallInst &I, + unsigned Intrinsic) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2; + if (I.getNumArgOperands() > 1) + Op2 = getValue(I.getArgOperand(1)); + SDLoc dl = getCurSDLoc(); + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + SDValue Res; + FastMathFlags FMF; + if (isa(I)) + FMF = I.getFastMathFlags(); + SDNodeFlags SDFlags; + SDFlags.setNoNaNs(FMF.noNaNs()); + + switch (Intrinsic) { + case Intrinsic::experimental_vector_reduce_fadd: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_fmul: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_add: + Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_mul: + Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_and: + Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_or: + Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_xor: + Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smax: + Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smin: + Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umax: + Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umin: + Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_fmax: { + Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); + break; + } + case Intrinsic::experimental_vector_reduce_fmin: { + Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); + break; + } + default: + llvm_unreachable("Unhandled vector reduce intrinsic"); + } + setValue(&I, Res); +} + /// Returns an AttributeList representing the attributes applied to the return /// value of the given call. 
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 9e9989058ae5..bdaee858da61 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -604,11 +604,11 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, CodeGenOpt::Level ol) : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), - DAG(dag), FuncInfo(funcinfo), + DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo), HasTailCall(false) { } - void init(GCFunctionInfo *gfi, AliasAnalysis &aa, + void init(GCFunctionInfo *gfi, AliasAnalysis *AA, const TargetLibraryInfo *li); /// Clear out the current SelectionDAG and the associated state and prepare @@ -909,6 +909,8 @@ private: void visitGCRelocate(const GCRelocateInst &I); void visitGCResult(const GCResultInst &I); + void visitVectorReduce(const CallInst &I, unsigned Intrinsic); + void visitUserOp1(const Instruction &I) { llvm_unreachable("UserOp1 should not exist at instruction selection time!"); } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 26dd45ef933f..c37d7080f2c5 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETFALSE: return "setfalse"; case ISD::SETFALSE2: return "setfalse2"; } + case ISD::VECREDUCE_FADD: return "vecreduce_fadd"; + case ISD::VECREDUCE_FMUL: return "vecreduce_fmul"; + case ISD::VECREDUCE_ADD: return "vecreduce_add"; + case ISD::VECREDUCE_MUL: return "vecreduce_mul"; + case ISD::VECREDUCE_AND: return "vecreduce_and"; + case ISD::VECREDUCE_OR: return "vecreduce_or"; + case ISD::VECREDUCE_XOR: return "vecreduce_xor"; + case ISD::VECREDUCE_SMAX: return "vecreduce_smax"; + case ISD::VECREDUCE_SMIN: return "vecreduce_smin"; + case ISD::VECREDUCE_UMAX: return "vecreduce_umax"; + case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; + case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; + case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 3aabdaeaa094..5e0feccb6b4c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -38,6 +38,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -299,7 +300,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, FuncInfo(new FunctionLoweringInfo()), CurDAG(new SelectionDAG(tm, OL)), SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), - GFI(), + AA(), GFI(), OptLevel(OL), DAGSize(0) { initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); @@ -317,7 +318,8 @@ SelectionDAGISel::~SelectionDAGISel() { } void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); + if (OptLevel != CodeGenOpt::None) + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -394,7 +396,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); TLI = 
MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - AA = &getAnalysis().getAAResults(); LibInfo = &getAnalysis().getTLI(); GFI = Fn.hasGC() ? &getAnalysis().getFunctionInfo(Fn) : nullptr; ORE = make_unique(&Fn); @@ -406,12 +407,22 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { CurDAG->init(*MF, *ORE); FuncInfo->set(Fn, *MF, CurDAG); + // Now get the optional analyzes if we want to. + // This is based on the possibly changed OptLevel (after optnone is taken + // into account). That's unfortunate but OK because it just means we won't + // ask for passes that have been required anyway. + if (UseMBPI && OptLevel != CodeGenOpt::None) FuncInfo->BPI = &getAnalysis().getBPI(); else FuncInfo->BPI = nullptr; - SDB->init(GFI, *AA, LibInfo); + if (OptLevel != CodeGenOpt::None) + AA = &getAnalysis().getAAResults(); + else + AA = nullptr; + + SDB->init(GFI, AA, LibInfo); MF->setHasInlineAsm(false); @@ -715,7 +726,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine1", "DAG Combining 1", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel); + CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber @@ -747,7 +758,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lt", "DAG Combining after legalize types", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber @@ -781,7 +792,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel); } DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#" @@ -807,7 +818,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine2", "DAG Combining 2", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel); } DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber @@ -1145,6 +1156,51 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, } } +/// Collect llvm.dbg.declare information. This is done after argument lowering +/// in case the declarations refer to arguments. +static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) { + MachineFunction *MF = FuncInfo->MF; + const DataLayout &DL = MF->getDataLayout(); + for (const BasicBlock &BB : *FuncInfo->Fn) { + for (const Instruction &I : BB) { + const DbgDeclareInst *DI = dyn_cast(&I); + if (!DI) + continue; + + assert(DI->getVariable() && "Missing variable"); + assert(DI->getDebugLoc() && "Missing location"); + const Value *Address = DI->getAddress(); + if (!Address) + continue; + + // Look through casts and constant offset GEPs. These mostly come from + // inalloca. + APInt Offset(DL.getPointerSizeInBits(0), 0); + Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + // Check if the variable is a static alloca or a byval or inalloca + // argument passed in memory. If it is not, then we will ignore this + // intrinsic and handle this during isel like dbg.value. 
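The INT_MAX test that follows mirrors the EmitFuncArgumentDbgValue fix above: getArgumentFrameIndex now signals "no index" with INT_MAX because 0 is a legitimate frame index (and fixed objects use negative ones), so the old truth-test style `if (int FI = ...)` silently dropped valid slots. A sketch of the sentinel convention (kNoFrameIndex and hasFrameIndex are hypothetical names):

  #include <climits>

  constexpr int kNoFrameIndex = INT_MAX; // sentinel, never a real index

  bool hasFrameIndex(int FI) {
    // 0 and negative values are real frame indices; only the sentinel
    // means "nothing recorded for this argument".
    return FI != kNoFrameIndex;
  }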
+ int FI = INT_MAX; + if (const auto *AI = dyn_cast(Address)) { + auto SI = FuncInfo->StaticAllocaMap.find(AI); + if (SI != FuncInfo->StaticAllocaMap.end()) + FI = SI->second; + } else if (const auto *Arg = dyn_cast(Address)) + FI = FuncInfo->getArgumentFrameIndex(Arg); + + if (FI == INT_MAX) + continue; + + DIExpression *Expr = DI->getExpression(); + if (Offset.getBoolValue()) + Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, + Offset.getZExtValue()); + MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc()); + } + } +} + /// Propagate swifterror values through the machine function CFG. static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) { auto *TLI = FuncInfo->TLI; @@ -1317,6 +1373,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB); + processDbgDeclares(FuncInfo); + // Iterate over all basic blocks in the function. for (const BasicBlock *LLVMBB : RPOT) { if (OptLevel != CodeGenOpt::None) { diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 23f597db140c..befbd80d7965 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -417,11 +417,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, if (TLI.isTruncateFree(Op.getValueType(), SmallVT) && TLI.isZExtFree(SmallVT, Op.getValueType())) { // We found a type with free casts. - SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT, - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(0)), - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(1))); + SDValue X = DAG.getNode( + Op.getOpcode(), dl, SmallVT, + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)), + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1))); bool NeedZext = DemandedSize > SmallVTBits; SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, dl, Op.getValueType(), X); @@ -817,7 +816,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits // are not demanded. This will likely allow the anyext to be folded away. if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) { - SDValue InnerOp = InOp.getNode()->getOperand(0); + SDValue InnerOp = InOp.getOperand(0); EVT InnerVT = InnerOp.getValueType(); unsigned InnerBits = InnerVT.getSizeInBits(); if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits && diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp index 4837495777da..2638702da152 100644 --- a/lib/CodeGen/ShrinkWrap.cpp +++ b/lib/CodeGen/ShrinkWrap.cpp @@ -282,8 +282,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, if (!Restore) Restore = &MBB; - else + else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it + // means the block never returns. If that's the + // case, we don't want to call + // `findNearestCommonDominator`, which will + // return `Restore`. Restore = MPDT->findNearestCommonDominator(Restore, &MBB); + else + Restore = nullptr; // Abort, we can't find a restore point in this case. // Make sure we would be able to insert the restore code before the // terminator. @@ -293,7 +299,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, continue; // One of the terminator needs to happen before the restore point. 
if (MBB.succ_empty()) { - Restore = nullptr; + Restore = nullptr; // Abort, we can't find a restore point in this case. break; } // Look for a restore point that post-dominates all the successors. @@ -419,7 +425,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF, } bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { - if (MF.empty() || !isShrinkWrapEnabled(MF)) + if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF)) return false; DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index ab578df4069d..e9eff4d0acb2 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -93,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) { doubleUnderDataTy, // __data VoidPtrTy, // __personality VoidPtrTy, // __lsda - doubleUnderJBufTy, // __jbuf - nullptr); + doubleUnderJBufTy // __jbuf + ); return true; } diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 34892680aceb..1d232c71d824 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -232,7 +232,11 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, if (!MD) return nullptr; - auto *VM = dyn_cast(MD->getOperand(0)); + const MDOperand &Op = MD->getOperand(0); + if (!Op.get()) + return nullptr; + + auto *VM = dyn_cast(Op); if (!VM) report_fatal_error("MD_associated operand is not ValueAsMetadata"); diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 150195f5f85b..e6c5d8753b83 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -487,6 +487,14 @@ void TargetPassConfig::addIRPasses() { // Insert calls to mcount-like functions. addPass(createCountingFunctionInserterPass()); + + // Add scalarization of target's unsupported masked memory intrinsics pass. + // the unsupported intrinsic will be replaced with a chain of basic blocks, + // that stores/loads element one-by-one if the appropriate mask bit is set. + addPass(createScalarizeMaskedMemIntrinPass()); + + // Expand reduction intrinsics into shuffle sequences if the target wants to. + addPass(createExpandReductionsPass()); } /// Turn exception handling constructs into something the code generators can @@ -607,6 +615,9 @@ void TargetPassConfig::addMachinePasses() { addPass(&LocalStackSlotAllocationID, false); } + if (getOptLevel() != CodeGenOpt::None) + addPass(&LiveRangeShrinkID); + // Run pre-ra passes. 
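Back in addIRPasses above, the new scalarization pass rewrites masked loads/stores the target cannot lower into per-element, mask-guarded accesses. Conceptually the emitted control flow behaves like this loop (a sketch only; the real pass builds branch-per-lane IR and also handles the passthru operand of llvm.masked.load):

  // Touch memory only where the mask bit is set, so a disabled lane
  // can never fault.
  void maskedLoadScalarized(const int *Src, const bool *Mask, int *Dst,
                            int NumElts) {
    for (int I = 0; I != NumElts; ++I)
      if (Mask[I])
        Dst[I] = Src[I];
  }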
addPreRegAlloc(); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 75359fe3c0ea..7392c8327148 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -155,7 +155,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); + AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); AU.addPreserved(); AU.addPreserved(); @@ -1627,7 +1627,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { InstrItins = MF->getSubtarget().getInstrItineraryData(); LV = getAnalysisIfAvailable(); LIS = getAnalysisIfAvailable(); - AA = &getAnalysis().getAAResults(); + if (auto *AAPass = getAnalysisIfAvailable()) + AA = &AAPass->getAAResults(); + else + AA = nullptr; OptLevel = TM.getOptLevel(); bool MadeChange = false; diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp index f085132b6a94..407fd9b162e9 100644 --- a/lib/CodeGen/UnreachableBlockElim.cpp +++ b/lib/CodeGen/UnreachableBlockElim.cpp @@ -206,11 +206,12 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { if (InputReg != OutputReg) { MachineRegisterInfo &MRI = F.getRegInfo(); unsigned InputSub = Input.getSubReg(); - if (InputSub == 0) { - MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg)); + if (InputSub == 0 && + MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) { MRI.replaceRegWith(OutputReg, InputReg); } else { - // The input register to the PHI has a subregister: + // The input register to the PHI has a subregister or it can't be + // constrained to the proper register class: // insert a COPY instead of simply replacing the output // with the input. const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index 410d5a3777d4..8d9353ae5f5e 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -13,7 +13,7 @@ add_llvm_library(LLVMDebugInfoCodeView ModuleDebugFragmentVisitor.cpp ModuleDebugInlineeLinesFragment.cpp ModuleDebugLineFragment.cpp - ModuleDebugUnknownFragment.cpp + RandomAccessTypeVisitor.cpp RecordSerialization.cpp StringTable.cpp SymbolRecordMapping.cpp diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index 0069ee3cc904..b6ed0453d9c4 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -26,8 +26,7 @@ CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks) : Callbacks(Callbacks) {} template -static Error visitKnownRecord(CVTypeVisitor &Visitor, CVType &Record, - TypeVisitorCallbacks &Callbacks) { +static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) { TypeRecordKind RK = static_cast(Record.Type); T KnownRecord(RK); if (auto EC = Callbacks.visitKnownRecord(Record, KnownRecord)) @@ -76,7 +75,7 @@ void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) { Handlers.push_back(&Handler); } -Error CVTypeVisitor::visitTypeRecord(CVType &Record) { +Expected CVTypeVisitor::handleTypeServer(CVType &Record) { if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) { auto TS = deserializeTypeServerRecord(Record); if (!TS) @@ -90,16 +89,16 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) { // If the handler processed the record, return success. 
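The CVTypeVisitor refactor in the next hunks funnels the probe into handleTypeServer(), whose Expected<bool> packs three outcomes into one return: an Error (propagate), true (a type-server handler consumed the record), or false (fall through to ordinary visitation). A compilable sketch of that calling convention (probeTypeServers and visitOneRecord are hypothetical stand-ins; the Expected/Error API is LLVM's real one):

  #include "llvm/Support/Error.h"

  // Stand-in for handleTypeServer(); declared only, to keep the sketch short.
  llvm::Expected<bool> probeTypeServers();

  llvm::Error visitOneRecord() {
    llvm::Expected<bool> Handled = probeTypeServers();
    if (!Handled)
      return Handled.takeError();    // error: propagate to the caller
    if (*Handled)
      return llvm::Error::success(); // true: a handler consumed the record
    // false: fall through to visitTypeBegin + finishVisitation (elided)
    return llvm::Error::success();
  }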
if (*ExpectedResult) - return Error::success(); + return true; // Otherwise keep searching for a handler, eventually falling out and // using the default record handler. } } + return false; +} - if (auto EC = Callbacks.visitTypeBegin(Record)) - return EC; - +Error CVTypeVisitor::finishVisitation(CVType &Record) { switch (Record.Type) { default: if (auto EC = Callbacks.visitUnknownType(Record)) @@ -107,7 +106,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) { break; #define TYPE_RECORD(EnumName, EnumVal, Name) \ case EnumName: { \ - if (auto EC = visitKnownRecord(*this, Record, Callbacks)) \ + if (auto EC = visitKnownRecord(Record, Callbacks)) \ return EC; \ break; \ } @@ -124,6 +123,32 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) { return Error::success(); } +Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { + auto ExpectedResult = handleTypeServer(Record); + if (!ExpectedResult) + return ExpectedResult.takeError(); + if (*ExpectedResult) + return Error::success(); + + if (auto EC = Callbacks.visitTypeBegin(Record, Index)) + return EC; + + return finishVisitation(Record); +} + +Error CVTypeVisitor::visitTypeRecord(CVType &Record) { + auto ExpectedResult = handleTypeServer(Record); + if (!ExpectedResult) + return ExpectedResult.takeError(); + if (*ExpectedResult) + return Error::success(); + + if (auto EC = Callbacks.visitTypeBegin(Record)) + return EC; + + return finishVisitation(Record); +} + static Error visitMemberRecord(CVMemberRecord &Record, TypeVisitorCallbacks &Callbacks) { if (auto EC = Callbacks.visitMemberBegin(Record)) diff --git a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp b/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp deleted file mode 100644 index 9fd2cb8ed3e8..000000000000 --- a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp +++ /dev/null @@ -1,10 +0,0 @@ -//===- ModuleDebugUnknownFragment.cpp ---------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h" \ No newline at end of file diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp new file mode 100644 index 000000000000..4cb9acbe07d9 --- /dev/null +++ b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp @@ -0,0 +1,91 @@ +//===- RandomAccessTypeVisitor.cpp ---------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h" + +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" + +using namespace llvm; +using namespace llvm::codeview; + +RandomAccessTypeVisitor::RandomAccessTypeVisitor( + const CVTypeArray &Types, uint32_t NumRecords, + PartialOffsetArray PartialOffsets) + : Database(NumRecords), Types(Types), DatabaseVisitor(Database), + InternalVisitor(Pipeline), PartialOffsets(PartialOffsets) { + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(DatabaseVisitor); + + KnownOffsets.resize(Database.capacity()); +} + +Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI, + TypeVisitorCallbacks &Callbacks) { + assert(TI.toArrayIndex() < Database.capacity()); + + if (!Database.contains(TI)) { + if (auto EC = visitRangeForType(TI)) + return EC; + } + + assert(Database.contains(TI)); + auto &Record = Database.getTypeRecord(TI); + CVTypeVisitor V(Callbacks); + return V.visitTypeRecord(Record, TI); +} + +Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) { + if (PartialOffsets.empty()) { + TypeIndex TIB(TypeIndex::FirstNonSimpleIndex); + TypeIndex TIE = TIB + Database.capacity(); + return visitRange(TIB, 0, TIE); + } + + auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI, + [](TypeIndex Value, const TypeIndexOffset &IO) { + return Value < IO.Type; + }); + + assert(Next != PartialOffsets.begin()); + auto Prev = std::prev(Next); + + TypeIndex TIB = Prev->Type; + TypeIndex TIE; + if (Next == PartialOffsets.end()) { + TIE = TypeIndex::fromArrayIndex(Database.capacity()); + } else { + TIE = Next->Type; + } + + if (auto EC = visitRange(TIB, Prev->Offset, TIE)) + return EC; + return Error::success(); +} + +Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset, + TypeIndex End) { + + auto RI = Types.at(BeginOffset); + assert(RI != Types.end()); + + while (Begin != End) { + assert(!Database.contains(Begin)); + if (auto EC = InternalVisitor.visitTypeRecord(*RI, Begin)) + return EC; + KnownOffsets[Begin.toArrayIndex()] = BeginOffset; + + BeginOffset += RI.getRecordLength(); + ++Begin; + ++RI; + } + + return Error::success(); +} diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp index 5b8841041f88..7924440e5e29 100644 --- a/lib/DebugInfo/CodeView/TypeDatabase.cpp +++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp @@ -65,20 +65,32 @@ static const SimpleTypeEntry SimpleTypeNames[] = { {"__bool64*", SimpleTypeKind::Boolean64}, }; -TypeDatabase::TypeDatabase(uint32_t ExpectedSize) : TypeNameStorage(Allocator) { - CVUDTNames.reserve(ExpectedSize); - TypeRecords.reserve(ExpectedSize); +TypeDatabase::TypeDatabase(uint32_t Capacity) : TypeNameStorage(Allocator) { + CVUDTNames.resize(Capacity); + TypeRecords.resize(Capacity); + ValidRecords.resize(Capacity); } -/// Gets the type index for the next type record. -TypeIndex TypeDatabase::getNextTypeIndex() const { - return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size()); +TypeIndex TypeDatabase::appendType(StringRef Name, const CVType &Data) { + TypeIndex TI; + TI = getAppendIndex(); + if (TI.toArrayIndex() >= capacity()) + grow(); + recordType(Name, TI, Data); + return TI; } -/// Records the name of a type, and reserves its type index. 
-void TypeDatabase::recordType(StringRef Name, const CVType &Data) { - CVUDTNames.push_back(Name); - TypeRecords.push_back(Data); +void TypeDatabase::recordType(StringRef Name, TypeIndex Index, + const CVType &Data) { + uint32_t AI = Index.toArrayIndex(); + + assert(!contains(Index)); + assert(AI < capacity()); + + CVUDTNames[AI] = Name; + TypeRecords[AI] = Data; + ValidRecords.set(AI); + ++Count; } /// Saves the name in a StringSet and creates a stable StringRef. @@ -104,24 +116,47 @@ StringRef TypeDatabase::getTypeName(TypeIndex Index) const { return ""; } - uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; - if (I < CVUDTNames.size()) - return CVUDTNames[I]; + if (contains(Index)) + return CVUDTNames[Index.toArrayIndex()]; return ""; } const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const { - return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex]; + assert(contains(Index)); + return TypeRecords[Index.toArrayIndex()]; } CVType &TypeDatabase::getTypeRecord(TypeIndex Index) { - return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex]; + assert(contains(Index)); + return TypeRecords[Index.toArrayIndex()]; +} + +bool TypeDatabase::contains(TypeIndex Index) const { + uint32_t AI = Index.toArrayIndex(); + if (AI >= capacity()) + return false; + + return ValidRecords.test(AI); } -bool TypeDatabase::containsTypeIndex(TypeIndex Index) const { - uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; - return I < CVUDTNames.size(); +uint32_t TypeDatabase::size() const { return Count; } + +uint32_t TypeDatabase::capacity() const { return TypeRecords.size(); } + +void TypeDatabase::grow() { + TypeRecords.emplace_back(); + CVUDTNames.emplace_back(); + ValidRecords.resize(ValidRecords.size() + 1); } -uint32_t TypeDatabase::size() const { return CVUDTNames.size(); } +bool TypeDatabase::empty() const { return size() == 0; } + +TypeIndex TypeDatabase::getAppendIndex() const { + if (empty()) + return TypeIndex::fromArrayIndex(0); + + int Index = ValidRecords.find_last(); + assert(Index != -1); + return TypeIndex::fromArrayIndex(Index) + 1; +} diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp index c234afd2288b..8d97f8b1cb40 100644 --- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp @@ -15,7 +15,7 @@ using namespace llvm; using namespace llvm::codeview; -Error TypeDatabaseVisitor::visitTypeBegin(CVRecord &Record) { +Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record) { assert(!IsInFieldList); // Reset Name to the empty string. If the visitor sets it, we know it. Name = ""; @@ -28,6 +28,22 @@ Error TypeDatabaseVisitor::visitTypeBegin(CVRecord &Record) { return Error::success(); } +Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) { + if (auto EC = visitTypeBegin(Record)) + return EC; + + CurrentTypeIndex = Index; + return Error::success(); +} + +StringRef TypeDatabaseVisitor::getTypeName(TypeIndex Index) const { + return TypeDB->getTypeName(Index); +} + +StringRef TypeDatabaseVisitor::saveTypeName(StringRef Name) { + return TypeDB->saveTypeName(Name); +} + Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) { if (CVR.Type == LF_FIELDLIST) { assert(IsInFieldList); @@ -39,7 +55,12 @@ Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) { // CVUDTNames is indexed by type index, and must have one entry for every // type. 
Field list members are not recorded, and are only referenced by // their containing field list record. - TypeDB.recordType(Name, CVR); + if (CurrentTypeIndex) + TypeDB->recordType(Name, *CurrentTypeIndex, CVR); + else + TypeDB->appendType(Name, CVR); + + CurrentTypeIndex.reset(); return Error::success(); } @@ -73,13 +94,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { uint32_t Size = Indices.size(); SmallString<256> TypeName("("); for (uint32_t I = 0; I < Size; ++I) { - StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]); + StringRef ArgTypeName = getTypeName(Indices[I]); TypeName.append(ArgTypeName); if (I + 1 != Size) TypeName.append(", "); } TypeName.push_back(')'); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } @@ -89,13 +110,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, uint32_t Size = Indices.size(); SmallString<256> TypeName("\""); for (uint32_t I = 0; I < Size; ++I) { - StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]); + StringRef ArgTypeName = getTypeName(Indices[I]); TypeName.append(ArgTypeName); if (I + 1 != Size) TypeName.append("\" \""); } TypeName.push_back('\"'); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } @@ -132,26 +153,26 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ProcedureRecord &Proc) { - StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType()); - StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList()); + StringRef ReturnTypeName = getTypeName(Proc.getReturnType()); + StringRef ArgListTypeName = getTypeName(Proc.getArgumentList()); SmallString<256> TypeName(ReturnTypeName); TypeName.push_back(' '); TypeName.append(ArgListTypeName); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, MemberFunctionRecord &MF) { - StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType()); - StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType()); - StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList()); + StringRef ReturnTypeName = getTypeName(MF.getReturnType()); + StringRef ClassTypeName = getTypeName(MF.getClassType()); + StringRef ArgListTypeName = getTypeName(MF.getArgumentList()); SmallString<256> TypeName(ReturnTypeName); TypeName.push_back(' '); TypeName.append(ClassTypeName); TypeName.append("::"); TypeName.append(ArgListTypeName); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } @@ -171,13 +192,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { if (Ptr.isPointerToMember()) { const MemberPointerInfo &MI = Ptr.getMemberInfo(); - StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType()); - StringRef ClassName = TypeDB.getTypeName(MI.getContainingType()); + StringRef PointeeName = getTypeName(Ptr.getReferentType()); + StringRef ClassName = getTypeName(MI.getContainingType()); SmallString<256> TypeName(PointeeName); TypeName.push_back(' '); TypeName.append(ClassName); TypeName.append("::*"); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); } else { SmallString<256> TypeName; if (Ptr.isConst()) @@ -187,7 +208,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { if (Ptr.isUnaligned()) TypeName.append("__unaligned "); - 
TypeName.append(TypeDB.getTypeName(Ptr.getReferentType())); + TypeName.append(getTypeName(Ptr.getReferentType())); if (Ptr.getMode() == PointerMode::LValueReference) TypeName.append("&"); @@ -197,7 +218,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { TypeName.append("*"); if (!TypeName.empty()) - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); } return Error::success(); } @@ -205,7 +226,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { uint16_t Mods = static_cast(Mod.getModifiers()); - StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType()); + StringRef ModifiedName = getTypeName(Mod.getModifiedType()); SmallString<256> TypeName; if (Mods & uint16_t(ModifierOptions::Const)) TypeName.append("const "); @@ -214,14 +235,14 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { if (Mods & uint16_t(ModifierOptions::Unaligned)) TypeName.append("__unaligned "); TypeName.append(ModifiedName); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, VFTableShapeRecord &Shape) { - Name = TypeDB.saveTypeName(""); + Name = + saveTypeName(""); return Error::success(); } diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 870d95221e7d..27a6e0987886 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -173,10 +173,13 @@ void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const { } Error TypeDumpVisitor::visitTypeBegin(CVType &Record) { + TypeIndex TI = getSourceDB().getAppendIndex(); + return visitTypeBegin(Record, TI); +} + +Error TypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) { W->startLine() << getLeafTypeName(Record.Type); - W->getOStream() << " (" - << HexNumber(getSourceDB().getNextTypeIndex().getIndex()) - << ")"; + W->getOStream() << " (" << HexNumber(Index.getIndex()) << ")"; W->getOStream() << " {\n"; W->indent(); W->printEnum("TypeLeafKind", unsigned(Record.Type), diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 246899ac12b9..59a060d143ff 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -66,7 +66,7 @@ uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, RelocAddrMap::const_iterator AI = Relocs->find(*Off); if (AI == Relocs->end()) return Data.getUnsigned(Off, Size); - return Data.getUnsigned(Off, Size) + AI->second.second; + return Data.getUnsigned(Off, Size) + AI->second.Value; } static void dumpAccelSection(raw_ostream &OS, StringRef Name, @@ -905,16 +905,23 @@ static Error createError(const Twine &Reason, llvm::Error E) { /// Returns the address of symbol relocation used against. Used for futher /// relocations computation. Symbol's section load address is taken in account if /// LoadedObjectInfo interface is provided. 
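The getSymbolAddress rework below memoizes per-symbol results across a section's relocations; the trick is that std::map::insert reports whether the key was new, so a single lookup both probes the cache and reserves the slot to fill. A generic sketch of the idiom (computeOnce and the int key are placeholders for the symbol-keyed map in the patch):

  #include <cstdint>
  #include <map>

  uint64_t computeOnce(int Key, std::map<int, uint64_t> &Cache) {
    auto [It, Inserted] = Cache.insert({Key, 0});
    if (!Inserted)
      return It->second;     // hit: reuse the stored value
    uint64_t Value = 42;     // stands in for the expensive computation
    It->second = Value;      // fill the slot insert() reserved
    return Value;
  }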
-static Expected getSymbolAddress(const object::ObjectFile &Obj, - const RelocationRef &Reloc, - const LoadedObjectInfo *L) { +static Expected +getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, + const LoadedObjectInfo *L, + std::map &Cache) { uint64_t Ret = 0; object::section_iterator RSec = Obj.section_end(); object::symbol_iterator Sym = Reloc.getSymbol(); + std::map::iterator CacheIt = Cache.end(); // First calculate the address of the symbol or section as it appears // in the object file if (Sym != Obj.symbol_end()) { + bool New; + std::tie(CacheIt, New) = Cache.insert({*Sym, 0}); + if (!New) + return CacheIt->second; + Expected SymAddrOrErr = Sym->getAddress(); if (!SymAddrOrErr) return createError("error: failed to compute symbol address: ", @@ -943,6 +950,10 @@ static Expected getSymbolAddress(const object::ObjectFile &Obj, if (L && RSec != Obj.section_end()) if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec)) Ret += SectionLoadAddress - RSec->getAddress(); + + if (CacheIt != Cache.end()) + CacheIt->second = Ret; + return Ret; } @@ -1075,6 +1086,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, continue; } + std::map AddrCache; if (Section.relocation_begin() != Section.relocation_end()) { uint64_t SectionSize = RelocatedSection->getSize(); for (const RelocationRef &Reloc : Section.relocations()) { @@ -1083,7 +1095,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (isRelocScattered(Obj, Reloc)) continue; - Expected SymAddrOrErr = getSymbolAddress(Obj, Reloc, L); + Expected SymAddrOrErr = + getSymbolAddress(Obj, Reloc, L, AddrCache); if (!SymAddrOrErr) { errs() << toString(SymAddrOrErr.takeError()) << '\n'; continue; @@ -1114,7 +1127,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, << " at " << format("%p", Address) << " with width " << format("%d", R.Width) << "\n"); - Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value))); + Map->insert({Address, {(uint8_t)R.Width, R.Value}}); } } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp index 0cf71f530446..6601393d7459 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp @@ -54,9 +54,8 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) { if (ParsedCUOffsets.insert(CUOffset).second) { DWARFAddressRangesVector CURanges; CU->collectAddressRanges(CURanges); - for (const auto &R : CURanges) { - appendRange(CUOffset, R.first, R.second); - } + for (const auto &R : CURanges) + appendRange(CUOffset, R.LowPC, R.HighPC); } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index 9380fe8fe85d..8da797750abd 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -69,8 +69,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const { if (RLE.isBaseAddressSelectionEntry(AddressSize)) { BaseAddress = RLE.EndAddress; } else { - Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress, - BaseAddress + RLE.EndAddress)); + Res.push_back( + {BaseAddress + RLE.StartAddress, BaseAddress + RLE.EndAddress}); } } return Res; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index 24039eb35209..e3bd759ba94b 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -60,8 +60,8 @@ static void dumpRanges(raw_ostream &OS, const 
DWARFAddressRangesVector& Ranges, OS << '\n'; OS.indent(Indent); OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")", - AddressSize*2, Range.first, - AddressSize*2, Range.second); + AddressSize*2, Range.LowPC, + AddressSize*2, Range.HighPC); } } @@ -229,9 +229,9 @@ DWARFDie::getAddressRanges() const { return DWARFAddressRangesVector(); // Single range specified by low/high PC. uint64_t LowPC, HighPC; - if (getLowAndHighPC(LowPC, HighPC)) { - return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC)); - } + if (getLowAndHighPC(LowPC, HighPC)) + return {{LowPC, HighPC}}; + // Multiple ranges from .debug_ranges section. auto RangesOffset = toSectionOffset(find(DW_AT_ranges)); if (RangesOffset) { @@ -257,7 +257,7 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const { bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const { for (const auto& R : getAddressRanges()) { - if (R.first <= Address && Address < R.second) + if (R.LowPC <= Address && Address < R.HighPC) return true; } return false; diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp index e0f819383289..25824f6eb83b 100644 --- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp @@ -24,7 +24,11 @@ bool DWARFTypeUnit::extractImpl(DataExtractor debug_info, return false; TypeHash = debug_info.getU64(offset_ptr); TypeOffset = debug_info.getU32(offset_ptr); - return TypeOffset < getLength(); + // TypeOffset is relative to the beginning of the header, + // so we have to account for the leading length field. + // FIXME: The size of the length field is 12 in DWARF64. + unsigned SizeOfLength = 4; + return TypeOffset < getLength() + SizeOfLength; } void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) { diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index f50487fc3ba3..3835d4da9ae9 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -349,18 +349,18 @@ void DWARFUnit::updateAddressDieMap(DWARFDie Die) { if (Die.isSubroutineDIE()) { for (const auto &R : Die.getAddressRanges()) { // Ignore 0-sized ranges. - if (R.first == R.second) + if (R.LowPC == R.HighPC) continue; - auto B = AddrDieMap.upper_bound(R.first); - if (B != AddrDieMap.begin() && R.first < (--B)->second.first) { + auto B = AddrDieMap.upper_bound(R.LowPC); + if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) { // The range is a sub-range of existing ranges, we need to split the // existing range. 
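These DWARF hunks swap the anonymous std::pair for a struct with named LowPC/HighPC members; the ranges remain half-open, which both the containment test above and the zero-size skip below depend on. A minimal sketch of the convention (AddressRange is a stand-in type here; the member names match the diff):

  #include <cstdint>

  struct AddressRange {
    uint64_t LowPC;  // first address inside the range
    uint64_t HighPC; // first address past the range (half-open)
  };

  bool contains(const AddressRange &R, uint64_t Addr) {
    return R.LowPC <= Addr && Addr < R.HighPC;
  }
  // LowPC == HighPC is an empty range: it contains no address at all.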
- if (R.second < B->second.first) - AddrDieMap[R.second] = B->second; - if (R.first > B->first) - AddrDieMap[B->first].first = R.first; + if (R.HighPC < B->second.first) + AddrDieMap[R.HighPC] = B->second; + if (R.LowPC > B->first) + AddrDieMap[B->first].first = R.LowPC; } - AddrDieMap[R.first] = std::make_pair(R.second, Die); + AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die); } } // Parent DIEs are added to the AddrDieMap prior to the Children DIEs to diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 9494e876da15..8a544296f65c 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -23,7 +23,7 @@ using namespace llvm; using namespace dwarf; using namespace object; -void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die, +void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue) { const auto Attr = AttrValue.Attr; switch (Attr) { @@ -68,7 +68,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die, } } -void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die, +void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue) { const auto Form = AttrValue.Value.getForm(); switch (Form) { @@ -136,7 +136,7 @@ void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die, } } -void DWARFVerifier::veifyDebugInfoReferences() { +void DWARFVerifier::verifyDebugInfoReferences() { // Take all references and make sure they point to an actual DIE by // getting the DIE by offset and emitting an error OS << "Verifying .debug_info references...\n"; @@ -172,7 +172,7 @@ bool DWARFVerifier::handleDebugInfo() { } } } - veifyDebugInfoReferences(); + verifyDebugInfoReferences(); return NumDebugInfoErrors == 0; } diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp index 375c35b11145..701a318511b8 100644 --- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp @@ -109,7 +109,7 @@ uint32_t TpiStreamBuilder::calculateHashBufferSize() const { } uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const { - return TypeIndexOffsets.size() * sizeof(TypeIndexOffset); + return TypeIndexOffsets.size() * sizeof(codeview::TypeIndexOffset); } Error TpiStreamBuilder::finalizeMsfLayout() { diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h index a5100a56bcf1..a27573f93b97 100644 --- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h +++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h @@ -94,9 +94,8 @@ class OrcMCJITReplacement : public ExecutionEngine { return ClientMM->registerEHFrames(Addr, LoadAddr, Size); } - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) override { - return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size); + void deregisterEHFrames() override { + return ClientMM->deregisterEHFrames(); } void notifyObjectLoaded(RuntimeDyld &RTDyld, diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp index de73fbde8eb7..99e84b7496d4 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp @@ -134,6 +134,18 @@ void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, #endif +void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, + size_t Size) { + registerEHFramesInProcess(Addr, Size); + EHFrames.push_back({Addr, 
Size}); +} + +void RTDyldMemoryManager::deregisterEHFrames() { + for (auto &Frame : EHFrames) + deregisterEHFramesInProcess(Frame.Addr, Frame.Size); + EHFrames.clear(); +} + static int jit_noop() { return 0; } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index df9d2ceba329..e9a4b71c903d 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -73,7 +73,9 @@ namespace llvm { void RuntimeDyldImpl::registerEHFrames() {} -void RuntimeDyldImpl::deregisterEHFrames() {} +void RuntimeDyldImpl::deregisterEHFrames() { + MemMgr.deregisterEHFrames(); +} #ifndef NDEBUG static void dumpSectionMemory(const SectionEntry &S, StringRef State) { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 50f63fb8dd39..660843765b3f 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -221,22 +221,10 @@ void RuntimeDyldELF::registerEHFrames() { uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); - RegisteredEHFrameSections.push_back(EHFrameSID); } UnregisteredEHFrameSections.clear(); } -void RuntimeDyldELF::deregisterEHFrames() { - for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) { - SID EHFrameSID = RegisteredEHFrameSections[i]; - uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); - size_t EHFrameSize = Sections[EHFrameSID].getSize(); - MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); - } - RegisteredEHFrameSections.clear(); -} - std::unique_ptr llvm::RuntimeDyldELF::create(Triple::ArchType Arch, RuntimeDyld::MemoryManager &MemMgr, @@ -802,20 +790,35 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section, writeInt32BE(LocalAddress, Delta / 2); break; } + case ELF::R_390_PC16: { + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); + assert(int16_t(Delta) == Delta && "R_390_PC16 overflow"); + writeInt16BE(LocalAddress, Delta); + break; + } case ELF::R_390_PC32: { int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int32_t(Delta) == Delta && "R_390_PC32 overflow"); writeInt32BE(LocalAddress, Delta); break; } - case ELF::R_390_64: - writeInt64BE(LocalAddress, Value + Addend); - break; case ELF::R_390_PC64: { int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); writeInt64BE(LocalAddress, Delta); break; } + case ELF::R_390_8: + *LocalAddress = (uint8_t)(Value + Addend); + break; + case ELF::R_390_16: + writeInt16BE(LocalAddress, Value + Addend); + break; + case ELF::R_390_32: + writeInt32BE(LocalAddress, Value + Addend); + break; + case ELF::R_390_64: + writeInt64BE(LocalAddress, Value + Addend); + break; } } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 84dd810101f3..fb5da6dd8bbb 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -152,7 +152,6 @@ private: // in a table until we receive a request to register all unregistered // EH frame sections with the memory manager. 
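// (Illustrative sketch, not from the patch: the bookkeeping that used to live
// in each RuntimeDyld target now sits in the base RTDyldMemoryManager, which
// is assumed to record one {Addr, Size} entry per registered frame, roughly
//
//   struct EHFrame { uint8_t *Addr; size_t Size; };
//   std::vector<EHFrame> EHFrames;   // appended to by registerEHFrames()
//
// and to drain that list in deregisterEHFrames() as shown above, which is why
// the per-target RegisteredEHFrameSections list below can be deleted.)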
SmallVector UnregisteredEHFrameSections; - SmallVector RegisteredEHFrameSections; // Map between GOT relocation value and corresponding GOT offset std::map GOTOffsetMap; @@ -180,7 +179,6 @@ public: StubMap &Stubs) override; bool isCompatibleFile(const object::ObjectFile &Obj) const override; void registerEHFrames() override; - void deregisterEHFrames() override; Error finalizeLoad(const ObjectFile &Obj, ObjSectionToIDMap &SectionMap) override; }; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index f5cc883d98fd..18c23c5a2a5d 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -515,7 +515,7 @@ public: virtual void registerEHFrames(); - virtual void deregisterEHFrames(); + void deregisterEHFrames(); virtual Error finalizeLoad(const ObjectFile &ObjImg, ObjSectionToIDMap &SectionMap) { diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h index 0398413e1532..6aa1a2bdb926 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h @@ -217,7 +217,6 @@ public: } void registerEHFrames() override {} - void deregisterEHFrames() override {} }; } diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h index 8c6af0bd9c6d..318afa21a88b 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h @@ -316,7 +316,6 @@ public: } void registerEHFrames() override {} - void deregisterEHFrames() override {} }; } diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h index 109beb36f1ee..26e73989d7ed 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h @@ -194,9 +194,6 @@ public: } UnregisteredEHFrameSections.clear(); } - void deregisterEHFrames() override { - // Stub - } Error finalizeLoad(const ObjectFile &Obj, ObjSectionToIDMap &SectionMap) override { // Look for and record the EH frame section IDs. diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index b85ba210afb3..e93c79cfcec6 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -656,7 +656,8 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { SMR.WaitClient(); size_t Size = SMR.ReadByteArraySize(); SMR.WriteByteArray(nullptr, 0); - F->RunOne(SMR.GetByteArray(), Size); + const Unit tmp(SMR.GetByteArray(), SMR.GetByteArray() + Size); + F->RunOne(tmp.data(), tmp.size()); SMR.PostServer(); } return 0; diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 0a1ff1b1df6a..7ff196c8fa96 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -92,10 +92,10 @@ FUZZER_FLAG_INT(print_pcs, 0, "If 1, print out newly covered PCs.") FUZZER_FLAG_INT(print_final_stats, 0, "If 1, print statistics at exit.") FUZZER_FLAG_INT(print_corpus_stats, 0, "If 1, print statistics on corpus elements at exit.") -FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information at exit." - " Experimental, only with trace-pc-guard") -FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information at exit." 
- " Experimental, only with trace-pc-guard") +FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information as text" + " at exit.") +FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information as a" + " .sancov file at exit.") FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.") FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.") FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.") diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index ad067ee2c0d9..5f184c2316e2 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -91,6 +91,7 @@ public: private: void AlarmCallback(); void CrashCallback(); + void CrashOnOverwrittenData(); void InterruptCallback(); void MutateAndTestOne(); void ReportNewCoverage(InputInfo *II, const Unit &U); diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index d84c3dbdaf77..14caa203c5ef 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -422,6 +422,24 @@ size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const { return CurrentUnitSize; } +void Fuzzer::CrashOnOverwrittenData() { + Printf("==%d== ERROR: libFuzzer: fuzz target overwrites it's const input\n", + GetPid()); + DumpCurrentUnit("crash-"); + Printf("SUMMARY: libFuzzer: out-of-memory\n"); + _Exit(Options.ErrorExitCode); // Stop right now. +} + +// Compare two arrays, but not all bytes if the arrays are large. +static bool LooseMemeq(const uint8_t *A, const uint8_t *B, size_t Size) { + const size_t Limit = 64; + if (Size <= 64) + return !memcmp(A, B, Size); + // Compare first and last Limit/2 bytes. + return !memcmp(A, B, Limit / 2) && + !memcmp(A + Size - Limit / 2, B + Size - Limit / 2, Limit / 2); +} + void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) { assert(InFuzzingThread()); if (SMR.IsClient()) @@ -443,6 +461,8 @@ void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) { (void)Res; assert(Res == 0); HasMoreMallocsThanFrees = AllocTracer.Stop(); + if (!LooseMemeq(DataCopy, Data, Size)) + CrashOnOverwrittenData(); CurrentUnitSize = 0; delete[] DataCopy; } diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp index cd846c7deec5..e60d4130de10 100644 --- a/lib/Fuzzer/FuzzerMutate.cpp +++ b/lib/Fuzzer/FuzzerMutate.cpp @@ -217,11 +217,12 @@ DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP( size_t NumPositions = 0; for (const uint8_t *Cur = Data; Cur < End && NumPositions < kMaxNumPositions; Cur++) { - Cur = (uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize); + Cur = + (const uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize); if (!Cur) break; Positions[NumPositions++] = Cur - Data; } - if (!NumPositions) break; + if (!NumPositions) continue; return DictionaryEntry(W, Positions[Rand(NumPositions)]); } DictionaryEntry DE(W); diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp index b3a54e57fceb..3815ed11cf60 100644 --- a/lib/Fuzzer/afl/afl_driver.cpp +++ b/lib/Fuzzer/afl/afl_driver.cpp @@ -59,6 +59,11 @@ statistics from the file. If that fails then the process will quit. #include #include #include + +#include +#include +#include + // Platform detection. Copied from FuzzerInternal.h #ifdef __linux__ #define LIBFUZZER_LINUX 1 @@ -245,17 +250,39 @@ extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) { return 0; } +// Execute any files provided as parameters. 
+int ExecuteFilesOnyByOne(int argc, char **argv) { + for (int i = 1; i < argc; i++) { + std::ifstream in(argv[i]); + in.seekg(0, in.end); + size_t length = in.tellg(); + in.seekg(0, in.beg); + std::cout << "Reading " << length << " bytes from " << argv[i] << std::endl; + // Allocate exactly length bytes so that we reliably catch buffer overflows. + std::vector<char> bytes(length); + in.read(bytes.data(), bytes.size()); + assert(in); + LLVMFuzzerTestOneInput(reinterpret_cast<const uint8_t *>(bytes.data()), + bytes.size()); + std::cout << "Execution successful" << std::endl; + } + return 0; +} + int main(int argc, char **argv) { - fprintf(stderr, "======================= INFO =========================\n" - "This binary is built for AFL-fuzz.\n" - "To run the target function on a single input execute this:\n" - " %s < INPUT_FILE\n" - "To run the fuzzing execute this:\n" - " afl-fuzz [afl-flags] %s [N] " - "-- run N fuzzing iterations before " - "re-spawning the process (default: 1000)\n" - "======================================================\n", - argv[0], argv[0]); + fprintf(stderr, + "======================= INFO =========================\n" + "This binary is built for AFL-fuzz.\n" + "To run the target function on individual input(s) execute this:\n" + " %s < INPUT_FILE\n" + "or\n" + " %s INPUT_FILE1 [INPUT_FILE2 ... ]\n" + "To fuzz with afl-fuzz execute this:\n" + " afl-fuzz [afl-flags] %s [-N]\n" + "afl-fuzz will run N iterations before " + "re-spawning the process (default: 1000)\n" + "======================================================\n", + argv[0], argv[0], argv[0]); if (LLVMFuzzerInitialize) LLVMFuzzerInitialize(&argc, &argv); // Do any other expensive one-time initialization here. @@ -266,8 +293,14 @@ int main(int argc, char **argv) { __afl_manual_init(); int N = 1000; - if (argc >= 2) - N = atoi(argv[1]); + if (argc == 2 && argv[1][0] == '-') + N = atoi(argv[1] + 1); + else if (argc == 2 && (N = atoi(argv[1])) > 0) + fprintf(stderr, "WARNING: using the deprecated call style `%s %d`\n", + argv[0], N); + else if (argc > 1) + return ExecuteFilesOnyByOne(argc, argv); + assert(N > 0); time_t unit_time_secs; int num_runs = 0; diff --git a/lib/Fuzzer/test/AFLDriverTest.cpp b/lib/Fuzzer/test/AFLDriverTest.cpp index 3dd0b6117305..e3f5f7100883 100644 --- a/lib/Fuzzer/test/AFLDriverTest.cpp +++ b/lib/Fuzzer/test/AFLDriverTest.cpp @@ -4,19 +4,25 @@ // Contains dummy functions used to avoid dependency on AFL. #include #include +#include <stdio.h> extern "C" void __afl_manual_init() {} -extern "C" int __afl_persistent_loop(unsigned int) { +extern "C" int __afl_persistent_loop(unsigned int N) { + static int Count = N; + fprintf(stderr, "__afl_persistent_loop called, Count = %d\n", Count); + if (Count--) return 1; return 0; } // This declaration exists to prevent the Darwin linker // from complaining about this being a missing weak symbol.
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) { + fprintf(stderr, "LLVMFuzzerInitialize called\n"); return 0; } extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + fprintf(stderr, "LLVMFuzzerTestOneInput called; Size = %zd\n", Size); return 0; } diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt index cd049d3f03d8..b39938a705f6 100644 --- a/lib/Fuzzer/test/CMakeLists.txt +++ b/lib/Fuzzer/test/CMakeLists.txt @@ -104,6 +104,7 @@ set(Tests OneHugeAllocTest OutOfMemoryTest OutOfMemorySingleLargeMallocTest + OverwriteInputTest RepeatedMemcmp RepeatedBytesTest SimpleCmpTest diff --git a/lib/Fuzzer/test/OverwriteInputTest.cpp b/lib/Fuzzer/test/OverwriteInputTest.cpp new file mode 100644 index 000000000000..e688682346a6 --- /dev/null +++ b/lib/Fuzzer/test/OverwriteInputTest.cpp @@ -0,0 +1,13 @@ +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. + +// Simple test for a fuzzer. Make sure we abort if Data is overwritten. +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + if (Size) + *const_cast(Data) = 1; + return 0; +} + diff --git a/lib/Fuzzer/test/afl-driver.test b/lib/Fuzzer/test/afl-driver.test new file mode 100644 index 000000000000..6eab23cc3636 --- /dev/null +++ b/lib/Fuzzer/test/afl-driver.test @@ -0,0 +1,26 @@ +REQUIRES: linux +RUN: echo -n "abc" > %t.file3 +RUN: echo -n "abcd" > %t.file4 + +RUN: AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK1 +CHECK1: __afl_persistent_loop calle, Count = 1000 +CHECK1: LLVMFuzzerTestOneInput called; Size = 3 + + +RUN: AFLDriverTest < %t.file3 -42 2>&1 | FileCheck %s --check-prefix=CHECK2 +CHECK2: __afl_persistent_loop calle, Count = 42 +CHECK2: LLVMFuzzerTestOneInput called; Size = 3 + + +RUN: AFLDriverTest < %t.file3 666 2>&1 | FileCheck %s --check-prefix=CHECK3 +CHECK3: WARNING: using the deprecated call style +CHECK3: __afl_persistent_loop calle, Count = 666 +CHECK3: LLVMFuzzerTestOneInput called; Size = 3 + + +RUN: AFLDriverTest %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK4 +CHECK4: LLVMFuzzerTestOneInput called; Size = 3 + +RUN: AFLDriverTest %t.file3 %t.file4 2>&1 | FileCheck %s --check-prefix=CHECK5 +CHECK5: LLVMFuzzerTestOneInput called; Size = 3 +CHECK5: LLVMFuzzerTestOneInput called; Size = 4 diff --git a/lib/Fuzzer/test/overwrite-input.test b/lib/Fuzzer/test/overwrite-input.test new file mode 100644 index 000000000000..81c27909e8df --- /dev/null +++ b/lib/Fuzzer/test/overwrite-input.test @@ -0,0 +1,2 @@ +RUN: not LLVMFuzzer-OverwriteInputTest 2>&1 | FileCheck %s +CHECK: ERROR: libFuzzer: fuzz target overwrites it's const input diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 4c6e3e3788bd..ec4663018bd4 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -805,6 +805,9 @@ void SlotTracker::processModule() { if (!Var.hasName()) CreateModuleSlot(&Var); processGlobalObjectMetadata(Var); + auto Attrs = Var.getAttributes(); + if (Attrs.hasAttributes()) + CreateAttributeSetSlot(Attrs); } for (const GlobalAlias &A : TheModule->aliases()) { @@ -2502,6 +2505,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { GV->getAllMetadata(MDs); printMetadataAttachments(MDs, ", "); + auto Attrs = GV->getAttributes(); + if (Attrs.hasAttributes()) + Out << " #" << Machine.getAttributeGroupSlot(Attrs); + printInfoComment(*GV); } diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index cf2925254695..acfac316e91e 100644 --- 
a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -1,4 +1,4 @@ -//===-- AttributeImpl.h - Attribute Internals -------------------*- C++ -*-===// +//===- AttributeImpl.h - Attribute Internals --------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -21,9 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/Attributes.h" #include "llvm/Support/TrailingObjects.h" -#include #include -#include #include #include #include @@ -80,11 +78,13 @@ public: else Profile(ID, getKindAsString(), getValueAsString()); } + static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind, uint64_t Val) { ID.AddInteger(Kind); if (Val) ID.AddInteger(Val); } + static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) { ID.AddString(Kind); if (!Values.empty()) ID.AddString(Values); @@ -114,9 +114,10 @@ public: }; class IntAttributeImpl : public EnumAttributeImpl { - void anchor() override; uint64_t Val; + void anchor() override; + public: IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val) : EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) { @@ -188,20 +189,22 @@ public: std::pair> getAllocSizeArgs() const; std::string getAsString(bool InAttrGrp) const; - typedef const Attribute *iterator; + using iterator = const Attribute *; + iterator begin() const { return getTrailingObjects(); } iterator end() const { return begin() + NumAttrs; } void Profile(FoldingSetNodeID &ID) const { Profile(ID, makeArrayRef(begin(), end())); } + static void Profile(FoldingSetNodeID &ID, ArrayRef AttrList) { for (const auto &Attr : AttrList) Attr.Profile(ID); } }; -typedef std::pair IndexAttrPair; +using IndexAttrPair = std::pair; //===----------------------------------------------------------------------===// /// \class @@ -265,7 +268,8 @@ public: return AvailableFunctionAttrs & ((uint64_t)1) << Kind; } - typedef AttributeSet::iterator iterator; + using iterator = AttributeSet::iterator; + iterator begin(unsigned Slot) const { return getSlotAttributes(Slot).begin(); } diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index 3b1140ab542c..ce60367a6c8b 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -34,6 +34,8 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include +#include #include #include #include @@ -504,16 +506,74 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef Attrs) { return AttributeSet(AttributeSetNode::get(C, Attrs)); } +AttributeSet AttributeSet::addAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const { + if (hasAttribute(Kind)) return *this; + AttrBuilder B; + B.addAttribute(Kind); + return addAttributes(C, AttributeSet::get(C, B)); +} + +AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind, + StringRef Value) const { + AttrBuilder B; + B.addAttribute(Kind, Value); + return addAttributes(C, AttributeSet::get(C, B)); +} + +AttributeSet AttributeSet::addAttributes(LLVMContext &C, + const AttributeSet AS) const { + if (!hasAttributes()) + return AS; + + if (!AS.hasAttributes()) + return *this; + + AttrBuilder B(AS); + for (Attribute I : *this) + B.addAttribute(I); + + return get(C, B); +} + +AttributeSet AttributeSet::removeAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const { + if (!hasAttribute(Kind)) return *this; + AttrBuilder B; + B.addAttribute(Kind); + return removeAttributes(C, B); +} + +AttributeSet AttributeSet::removeAttribute(LLVMContext &C, + StringRef Kind) const { + if (!hasAttribute(Kind)) return *this; + AttrBuilder B; + B.addAttribute(Kind); + return removeAttributes(C, 
B); +} + +AttributeSet AttributeSet::removeAttributes(LLVMContext &C, + const AttrBuilder &Attrs) const { + + // FIXME it is not obvious how this should work for alignment. + // For now, say we can't pass in alignment, which no current use does. + assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!"); + + AttrBuilder B(*this); + B.remove(Attrs); + return get(C, B); +} + unsigned AttributeSet::getNumAttributes() const { return SetNode ? SetNode->getNumAttributes() : 0; } bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const { - return SetNode ? SetNode->hasAttribute(Kind) : 0; + return SetNode ? SetNode->hasAttribute(Kind) : false; } bool AttributeSet::hasAttribute(StringRef Kind) const { - return SetNode ? SetNode->hasAttribute(Kind) : 0; + return SetNode ? SetNode->hasAttribute(Kind) : false; } Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const { @@ -557,6 +617,14 @@ AttributeSet::iterator AttributeSet::end() const { return SetNode ? SetNode->end() : nullptr; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void AttributeSet::dump() const { + dbgs() << "AS =\n"; + dbgs() << " { "; + dbgs() << getAsString(true) << " }\n"; +} +#endif + //===----------------------------------------------------------------------===// // AttributeSetNode Definition //===----------------------------------------------------------------------===// diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 80b117015ede..a20f3f811c8d 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -2041,9 +2041,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, Optional InRangeIndex, ArrayRef Idxs) { if (Idxs.empty()) return C; - Constant *Idx0 = cast(Idxs[0]); - if ((Idxs.size() == 1 && Idx0->isNullValue())) - return C; if (isa(C)) { Type *GEPTy = GetElementPtrInst::getGEPReturnType( @@ -2051,10 +2048,15 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, return UndefValue::get(GEPTy); } + Constant *Idx0 = cast(Idxs[0]); + if (Idxs.size() == 1 && (Idx0->isNullValue() || isa(Idx0))) + return C; + if (C->isNullValue()) { bool isNull = true; for (unsigned i = 0, e = Idxs.size(); i != e; ++i) - if (!cast(Idxs[i])->isNullValue()) { + if (!isa(Idxs[i]) && + !cast(Idxs[i])->isNullValue()) { isNull = false; break; } diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp index aeb1257754f3..509caba3acd4 100644 --- a/lib/IR/ConstantRange.cpp +++ b/lib/IR/ConstantRange.cpp @@ -278,7 +278,7 @@ APInt ConstantRange::getUnsignedMax() const { } APInt ConstantRange::getUnsignedMin() const { - if (isFullSet() || (isWrappedSet() && getUpper() != 0)) + if (isFullSet() || (isWrappedSet() && !getUpper().isNullValue())) return APInt::getMinValue(getBitWidth()); return getLower(); } @@ -442,7 +442,7 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const { APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower; APInt U = (CR.Upper - 1).ugt(Upper - 1) ? CR.Upper : Upper; - if (L == 0 && U == 0) + if (L.isNullValue() && U.isNullValue()) return ConstantRange(getBitWidth()); return ConstantRange(std::move(L), std::move(U)); @@ -757,7 +757,8 @@ ConstantRange::multiply(const ConstantRange &Other) const { // from one positive number to another which is as good as we can generate. // In this case, skip the extra work of generating signed ranges which aren't // going to be better than this range. 
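// (Worked example, illustration only: for i8 ranges X = [2, 5) and
// Y = [3, 4), the unsigned computation gives [2*3, 4*3 + 1) = [6, 13), which
// already covers every concrete product {6, 9, 12}; that range does not wrap
// and its upper end is still non-negative as a signed value, so the signed
// computation below could not produce anything tighter and is skipped.)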
- if (!UR.isWrappedSet() && UR.getLower().isNonNegative()) + if (!UR.isWrappedSet() && + (UR.getUpper().isNonNegative() || UR.getUpper().isMinSignedValue())) return UR; // Now the signed range. Because we could be dealing with negative numbers @@ -834,7 +835,7 @@ ConstantRange::umin(const ConstantRange &Other) const { ConstantRange ConstantRange::udiv(const ConstantRange &RHS) const { - if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0) + if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); if (RHS.isFullSet()) return ConstantRange(getBitWidth(), /*isFullSet=*/true); @@ -842,7 +843,7 @@ ConstantRange::udiv(const ConstantRange &RHS) const { APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax()); APInt RHS_umin = RHS.getUnsignedMin(); - if (RHS_umin == 0) { + if (RHS_umin.isNullValue()) { // We want the lowest value in RHS excluding zero. Usually that would be 1 // except for a range in the form of [X, 1) in which case it would be X. if (RHS.getUpper() == 1) @@ -892,29 +893,33 @@ ConstantRange::shl(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); - APInt min = getUnsignedMin().shl(Other.getUnsignedMin()); - APInt max = getUnsignedMax().shl(Other.getUnsignedMax()); + APInt max = getUnsignedMax(); + APInt Other_umax = Other.getUnsignedMax(); - // there's no overflow! - APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros()); - if (Zeros.ugt(Other.getUnsignedMax())) - return ConstantRange(std::move(min), std::move(max) + 1); + // there's overflow! + if (Other_umax.uge(max.countLeadingZeros())) + return ConstantRange(getBitWidth(), /*isFullSet=*/true); // FIXME: implement the other tricky cases - return ConstantRange(getBitWidth(), /*isFullSet=*/true); + + APInt min = getUnsignedMin(); + min <<= Other.getUnsignedMin(); + max <<= Other_umax; + + return ConstantRange(std::move(min), std::move(max) + 1); } ConstantRange ConstantRange::lshr(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); - - APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()); + + APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()) + 1; APInt min = getUnsignedMin().lshr(Other.getUnsignedMax()); - if (min == max + 1) + if (min == max) return ConstantRange(getBitWidth(), /*isFullSet=*/true); - return ConstantRange(std::move(min), std::move(max) + 1); + return ConstantRange(std::move(min), std::move(max)); } ConstantRange ConstantRange::inverse() const { diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index ffc8f2e4303b..4b9d89cda539 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include -#include + using namespace llvm; //===----------------------------------------------------------------------===// @@ -966,16 +966,6 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef V) { return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V); } -Constant *ConstantStruct::get(StructType *T, ...) 
{ - va_list ap; - SmallVector Values; - va_start(ap, T); - while (Constant *Val = va_arg(ap, llvm::Constant*)) - Values.push_back(Val); - va_end(ap); - return get(T, Values); -} - ConstantVector::ConstantVector(VectorType *T, ArrayRef V) : ConstantAggregate(T, ConstantVectorVal, V) { assert(V.size() == T->getNumElements() && @@ -1810,8 +1800,7 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) { Constant *ConstantExpr::getAlignOf(Type* Ty) { // alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1 // Note that a non-inbounds gep is used, as null isn't within any object. - Type *AligningTy = - StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr); + Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty); Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0)); Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0); Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1); diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h index eda751d8af4a..25eb9452d9d0 100644 --- a/lib/IR/ConstantsContext.h +++ b/lib/IR/ConstantsContext.h @@ -22,6 +22,7 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InlineAsm.h" @@ -387,31 +388,34 @@ struct ConstantExprKeyType; template struct ConstantInfo; template <> struct ConstantInfo { - typedef ConstantExprKeyType ValType; - typedef Type TypeClass; + using ValType = ConstantExprKeyType; + using TypeClass = Type; }; template <> struct ConstantInfo { - typedef InlineAsmKeyType ValType; - typedef PointerType TypeClass; + using ValType = InlineAsmKeyType; + using TypeClass = PointerType; }; template <> struct ConstantInfo { - typedef ConstantAggrKeyType ValType; - typedef ArrayType TypeClass; + using ValType = ConstantAggrKeyType; + using TypeClass = ArrayType; }; template <> struct ConstantInfo { - typedef ConstantAggrKeyType ValType; - typedef StructType TypeClass; + using ValType = ConstantAggrKeyType; + using TypeClass = StructType; }; template <> struct ConstantInfo { - typedef ConstantAggrKeyType ValType; - typedef VectorType TypeClass; + using ValType = ConstantAggrKeyType; + using TypeClass = VectorType; }; template struct ConstantAggrKeyType { ArrayRef Operands; + ConstantAggrKeyType(ArrayRef Operands) : Operands(Operands) {} + ConstantAggrKeyType(ArrayRef Operands, const ConstantClass *) : Operands(Operands) {} + ConstantAggrKeyType(const ConstantClass *C, SmallVectorImpl &Storage) { assert(Storage.empty() && "Expected empty storage"); @@ -437,7 +441,8 @@ template struct ConstantAggrKeyType { return hash_combine_range(Operands.begin(), Operands.end()); } - typedef typename ConstantInfo::TypeClass TypeClass; + using TypeClass = typename ConstantInfo::TypeClass; + ConstantClass *create(TypeClass *Ty) const { return new (Operands.size()) ConstantClass(Ty, Operands); } @@ -457,6 +462,7 @@ struct InlineAsmKeyType { : AsmString(AsmString), Constraints(Constraints), FTy(FTy), HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack), AsmDialect(AsmDialect) {} + InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl &) : AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()), FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()), @@ -483,7 +489,8 @@ struct InlineAsmKeyType { AsmDialect, FTy); } - typedef ConstantInfo::TypeClass TypeClass; + using TypeClass = ConstantInfo::TypeClass; + 
InlineAsm *create(TypeClass *Ty) const { assert(PointerType::getUnqual(FTy) == Ty); return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects, @@ -507,11 +514,13 @@ struct ConstantExprKeyType { : Opcode(Opcode), SubclassOptionalData(SubclassOptionalData), SubclassData(SubclassData), Ops(Ops), Indexes(Indexes), ExplicitTy(ExplicitTy) {} + ConstantExprKeyType(ArrayRef Operands, const ConstantExpr *CE) : Opcode(CE->getOpcode()), SubclassOptionalData(CE->getRawSubclassOptionalData()), SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands), Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef()) {} + ConstantExprKeyType(const ConstantExpr *CE, SmallVectorImpl &Storage) : Opcode(CE->getOpcode()), @@ -553,7 +562,8 @@ struct ConstantExprKeyType { hash_combine_range(Indexes.begin(), Indexes.end())); } - typedef ConstantInfo::TypeClass TypeClass; + using TypeClass = ConstantInfo::TypeClass; + ConstantExpr *create(TypeClass *Ty) const { switch (Opcode) { default: @@ -594,16 +604,17 @@ struct ConstantExprKeyType { template class ConstantUniqueMap { public: - typedef typename ConstantInfo::ValType ValType; - typedef typename ConstantInfo::TypeClass TypeClass; - typedef std::pair LookupKey; + using ValType = typename ConstantInfo::ValType; + using TypeClass = typename ConstantInfo::TypeClass; + using LookupKey = std::pair; /// Key and hash together, so that we compute the hash only once and reuse it. - typedef std::pair LookupKeyHashed; + using LookupKeyHashed = std::pair; private: struct MapInfo { - typedef DenseMapInfo ConstantClassInfo; + using ConstantClassInfo = DenseMapInfo; + static inline ConstantClass *getEmptyKey() { return ConstantClassInfo::getEmptyKey(); } @@ -643,7 +654,7 @@ private: }; public: - typedef DenseSet MapTy; + using MapTy = DenseSet; private: MapTy Map; diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index cdbe237766a3..e6c49cad0722 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -672,6 +672,24 @@ void DIExpression::appendOffset(SmallVectorImpl &Ops, } } +bool DIExpression::extractIfOffset(int64_t &Offset) const { + if (getNumElements() == 0) { + Offset = 0; + return true; + } + if (getNumElements() != 2) + return false; + if (Elements[0] == dwarf::DW_OP_plus) { + Offset = Elements[1]; + return true; + } + if (Elements[0] == dwarf::DW_OP_minus) { + Offset = -Elements[1]; + return true; + } + return false; +} + DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref, int64_t Offset, bool StackValue) { SmallVector Ops; diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp index f31074a7ad44..3168ec6944a3 100644 --- a/lib/IR/DebugLoc.cpp +++ b/lib/IR/DebugLoc.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/DebugLoc.h" +#include "llvm/IR/IntrinsicInst.h" #include "LLVMContextImpl.h" #include "llvm/IR/DebugInfo.h" using namespace llvm; @@ -66,6 +67,119 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope, const_cast(InlinedAt)); } +DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt, + LLVMContext &Ctx, + DenseMap &Cache, + bool ReplaceLast) { + SmallVector InlinedAtLocations; + DILocation *Last = InlinedAt; + DILocation *CurInlinedAt = DL; + + // Gather all the inlined-at nodes. + while (DILocation *IA = CurInlinedAt->getInlinedAt()) { + // Skip any we've already built nodes for. 
+ if (auto *Found = Cache[IA]) { + Last = cast(Found); + break; + } + + if (ReplaceLast && !IA->getInlinedAt()) + break; + InlinedAtLocations.push_back(IA); + CurInlinedAt = IA; + } + + // Starting from the top, rebuild the nodes to point to the new inlined-at + // location (then rebuilding the rest of the chain behind it) and update the + // map of already-constructed inlined-at nodes. + for (const DILocation *MD : reverse(InlinedAtLocations)) + Cache[MD] = Last = DILocation::getDistinct( + Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); + + return Last; +} + +/// Reparent \c Scope from \c OrigSP to \c NewSP. +static DIScope *reparentScope(LLVMContext &Ctx, DIScope *Scope, + DISubprogram *OrigSP, DISubprogram *NewSP, + DenseMap &Cache) { + SmallVector ScopeChain; + DIScope *Last = NewSP; + DIScope *CurScope = Scope; + do { + if (auto *SP = dyn_cast(CurScope)) { + // Don't rewrite this scope chain if it doesn't lead to the replaced SP. + if (SP != OrigSP) + return Scope; + Cache.insert({OrigSP, NewSP}); + break; + } + if (auto *Found = Cache[CurScope]) { + Last = cast(Found); + break; + } + ScopeChain.push_back(CurScope); + } while ((CurScope = CurScope->getScope().resolve())); + + // Starting from the top, rebuild the nodes to point to the new inlined-at + // location (then rebuilding the rest of the chain behind it) and update the + // map of already-constructed inlined-at nodes. + for (const DIScope *MD : reverse(ScopeChain)) { + if (auto *LB = dyn_cast(MD)) + Cache[MD] = Last = DILexicalBlock::getDistinct( + Ctx, Last, LB->getFile(), LB->getLine(), LB->getColumn()); + else if (auto *LB = dyn_cast(MD)) + Cache[MD] = Last = DILexicalBlockFile::getDistinct( + Ctx, Last, LB->getFile(), LB->getDiscriminator()); + else + llvm_unreachable("illegal parent scope"); + } + return Last; +} + +void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP, + DISubprogram *NewSP, + DenseMap &Cache) { + auto DL = I.getDebugLoc(); + if (!OrigSP || !NewSP || OrigSP == NewSP || !DL) + return; + + // Reparent the debug location. + auto &Ctx = I.getContext(); + DILocation *InlinedAt = DL->getInlinedAt(); + if (InlinedAt) { + while (auto *IA = InlinedAt->getInlinedAt()) + InlinedAt = IA; + auto NewScope = + reparentScope(Ctx, InlinedAt->getScope(), OrigSP, NewSP, Cache); + InlinedAt = + DebugLoc::get(InlinedAt->getLine(), InlinedAt->getColumn(), NewScope); + } + I.setDebugLoc( + DebugLoc::get(DL.getLine(), DL.getCol(), + reparentScope(Ctx, DL->getScope(), OrigSP, NewSP, Cache), + DebugLoc::appendInlinedAt(DL, InlinedAt, Ctx, Cache, + ReplaceLastInlinedAt))); + + // Fix up debug variables to point to NewSP. 
+ auto reparentVar = [&](DILocalVariable *Var) { + return DILocalVariable::getDistinct( + Ctx, + cast( + reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)), + Var->getName(), Var->getFile(), Var->getLine(), Var->getType(), + Var->getArg(), Var->getFlags(), Var->getAlignInBits()); + }; + if (auto *DbgValue = dyn_cast(&I)) { + auto *Var = DbgValue->getVariable(); + I.setOperand(2, MetadataAsValue::get(Ctx, reparentVar(Var))); + } else if (auto *DbgDeclare = dyn_cast(&I)) { + auto *Var = DbgDeclare->getVariable(); + I.setOperand(1, MetadataAsValue::get(Ctx, reparentVar(Var))); + } +} + + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void DebugLoc::dump() const { if (!Loc) diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index 395b6158e0c8..e73f53f3202d 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -12,20 +12,31 @@ // Diagnostics reporting is still done as part of the LLVMContext. //===----------------------------------------------------------------------===// -#include "llvm/IR/DiagnosticInfo.h" -#include "LLVMContextImpl.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Regex.h" #include +#include +#include #include using namespace llvm; @@ -53,6 +64,8 @@ struct PassRemarksOpt { } }; +} // end anonymous namespace + static PassRemarksOpt PassRemarksOptLoc; static PassRemarksOpt PassRemarksMissedOptLoc; static PassRemarksOpt PassRemarksAnalysisOptLoc; @@ -85,7 +98,6 @@ PassRemarksAnalysis( "the given regular expression"), cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired, cl::ZeroOrMore); -} int llvm::getNextAvailablePluginDiagnosticKind() { static std::atomic PluginKindID(DK_FirstPluginKind); @@ -97,8 +109,7 @@ const char *OptimizationRemarkAnalysis::AlwaysPrint = ""; DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I, const Twine &MsgStr, DiagnosticSeverity Severity) - : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr), - Instr(&I) { + : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr), Instr(&I) { if (const MDNode *SrcLoc = I.getMetadata("srcloc")) { if (SrcLoc->getNumOperands() != 0) if (const auto *CI = @@ -193,7 +204,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V // Only include names that correspond to user variables. FIXME: we should use // debug info if available to get the name of the user variable. 
if (isa(V) || isa(V)) - Val = GlobalValue::getRealLinkageName(V->getName()); + Val = GlobalValue::dropLLVMManglingEscape(V->getName()); else if (isa(V)) { raw_string_ostream OS(Val); V->printAsOperand(OS, /*PrintType=*/false); diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 58c060550322..16a9e51b8306 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -1,4 +1,4 @@ -//===-- Function.cpp - Implement the Global object classes ----------------===// +//===- Function.cpp - Implement the Global object classes -----------------===// // // The LLVM Compiler Infrastructure // @@ -11,21 +11,51 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Function.h" #include "LLVMContextImpl.h" #include "SymbolTableListTraitsImpl.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/SymbolTableListTraits.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include +#include +#include +#include + using namespace llvm; // Explicit instantiations of SymbolTableListTraits since some of the methods @@ -36,7 +66,7 @@ template class llvm::SymbolTableListTraits; // Argument Implementation //===----------------------------------------------------------------------===// -void Argument::anchor() { } +void Argument::anchor() {} Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo) : Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) { @@ -186,7 +216,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name, Module *ParentModule) : GlobalObject(Ty, Value::FunctionVal, OperandTraits::op_begin(this), 0, Linkage, name), - Arguments(nullptr), NumArgs(Ty->getNumParams()) { + NumArgs(Ty->getNumParams()) { assert(FunctionType::isValidReturnType(getReturnType()) && "invalid return type"); setGlobalObjectSubClassData(0); @@ -386,24 +416,20 @@ void Function::clearGC() { /// Copy all additional attributes (those not needed to create a Function) from /// the Function Src to this one. 
-void Function::copyAttributesFrom(const GlobalValue *Src) { +void Function::copyAttributesFrom(const Function *Src) { GlobalObject::copyAttributesFrom(Src); - const Function *SrcF = dyn_cast(Src); - if (!SrcF) - return; - - setCallingConv(SrcF->getCallingConv()); - setAttributes(SrcF->getAttributes()); - if (SrcF->hasGC()) - setGC(SrcF->getGC()); + setCallingConv(Src->getCallingConv()); + setAttributes(Src->getAttributes()); + if (Src->hasGC()) + setGC(Src->getGC()); else clearGC(); - if (SrcF->hasPersonalityFn()) - setPersonalityFn(SrcF->getPersonalityFn()); - if (SrcF->hasPrefixData()) - setPrefixData(SrcF->getPrefixData()); - if (SrcF->hasPrologueData()) - setPrologueData(SrcF->getPrologueData()); + if (Src->hasPersonalityFn()) + setPersonalityFn(Src->getPersonalityFn()); + if (Src->hasPrefixData()) + setPrefixData(Src->getPrefixData()); + if (Src->hasPrologueData()) + setPrologueData(Src->getPrologueData()); } /// Table of string intrinsic names indexed by enum value. @@ -486,10 +512,10 @@ void Function::recalculateIntrinsicID() { static std::string getMangledTypeStr(Type* Ty) { std::string Result; if (PointerType* PTyp = dyn_cast(Ty)) { - Result += "p" + llvm::utostr(PTyp->getAddressSpace()) + + Result += "p" + utostr(PTyp->getAddressSpace()) + getMangledTypeStr(PTyp->getElementType()); } else if (ArrayType* ATyp = dyn_cast(Ty)) { - Result += "a" + llvm::utostr(ATyp->getNumElements()) + + Result += "a" + utostr(ATyp->getNumElements()) + getMangledTypeStr(ATyp->getElementType()); } else if (StructType *STyp = dyn_cast(Ty)) { if (!STyp->isLiteral()) { @@ -534,7 +560,6 @@ std::string Intrinsic::getName(ID id, ArrayRef Tys) { return Result; } - /// IIT_Info - These are enumerators that describe the entries returned by the /// getIntrinsicInfoTableEntries function. 
/// @@ -585,9 +610,10 @@ enum IIT_Info { static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, SmallVectorImpl &OutputTable) { + using namespace Intrinsic; + IIT_Info Info = IIT_Info(Infos[NextElt++]); unsigned StructElts = 2; - using namespace Intrinsic; switch (Info) { case IIT_Done: @@ -742,7 +768,6 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, llvm_unreachable("unhandled"); } - #define GET_INTRINSIC_GENERATOR_GLOBAL #include "llvm/IR/Intrinsics.gen" #undef GET_INTRINSIC_GENERATOR_GLOBAL @@ -780,10 +805,10 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id, DecodeIITType(NextElt, IITEntries, T); } - static Type *DecodeFixedType(ArrayRef &Infos, ArrayRef Tys, LLVMContext &Context) { using namespace Intrinsic; + IITDescriptor D = Infos.front(); Infos = Infos.slice(1); @@ -855,12 +880,10 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::VecOfAnyPtrsToElt: // Return the overloaded type (which determines the pointers address space) return Tys[D.getOverloadArgNumber()]; - } + } llvm_unreachable("unhandled"); } - - FunctionType *Intrinsic::getType(LLVMContext &Context, ID id, ArrayRef Tys) { SmallVector Table; diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 5f338f58d940..17d27b016cf2 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -69,6 +69,30 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { setDLLStorageClass(Src->getDLLStorageClass()); } +void GlobalValue::removeFromParent() { + switch (getValueID()) { +#define HANDLE_GLOBAL_VALUE(NAME) \ + case Value::NAME##Val: \ + return static_cast(this)->removeFromParent(); +#include "llvm/IR/Value.def" + default: + break; + } + llvm_unreachable("not a global"); +} + +void GlobalValue::eraseFromParent() { + switch (getValueID()) { +#define HANDLE_GLOBAL_VALUE(NAME) \ + case Value::NAME##Val: \ + return static_cast(this)->eraseFromParent(); +#include "llvm/IR/Value.def" + default: + break; + } + llvm_unreachable("not a global"); +} + unsigned GlobalValue::getAlignment() const { if (auto *GA = dyn_cast(this)) { // In general we cannot compute this at the IR level, but we try. @@ -93,12 +117,10 @@ void GlobalObject::setAlignment(unsigned Align) { assert(getAlignment() == Align && "Alignment representation error!"); } -void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { +void GlobalObject::copyAttributesFrom(const GlobalObject *Src) { GlobalValue::copyAttributesFrom(Src); - if (const auto *GV = dyn_cast(Src)) { - setAlignment(GV->getAlignment()); - setSection(GV->getSection()); - } + setAlignment(Src->getAlignment()); + setSection(Src->getSection()); } std::string GlobalValue::getGlobalIdentifier(StringRef Name, @@ -233,7 +255,7 @@ bool GlobalValue::canIncreaseAlignment() const { const GlobalObject *GlobalValue::getBaseObject() const { if (auto *GO = dyn_cast(this)) return GO; - if (auto *GA = dyn_cast(this)) + if (auto *GA = dyn_cast(this)) return GA->getBaseObject(); return nullptr; } @@ -333,12 +355,11 @@ void GlobalVariable::setInitializer(Constant *InitVal) { /// Copy all additional attributes (those not needed to create a GlobalVariable) /// from the GlobalVariable Src to this one. 
-void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { +void GlobalVariable::copyAttributesFrom(const GlobalVariable *Src) { GlobalObject::copyAttributesFrom(Src); - if (const GlobalVariable *SrcVar = dyn_cast(Src)) { - setThreadLocalMode(SrcVar->getThreadLocalMode()); - setExternallyInitialized(SrcVar->isExternallyInitialized()); - } + setThreadLocalMode(Src->getThreadLocalMode()); + setExternallyInitialized(Src->isExternallyInitialized()); + setAttributes(Src->getAttributes()); } void GlobalVariable::dropAllReferences() { diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp index e265a823687f..3477c087967f 100644 --- a/lib/IR/IRBuilder.cpp +++ b/lib/IR/IRBuilder.cpp @@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align, return CI; } +static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID, + Value *Src) { + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Src}; + Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() }; + auto Decl = Intrinsic::getDeclaration(M, ID, Tys); + return createCallHelper(Decl, Ops, Builder); +} + +CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Acc, Src}; + Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(), + Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_fadd, Tys); + return createCallHelper(Decl, Ops, this); +} + +CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Acc, Src}; + Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(), + Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_fmul, Tys); + return createCallHelper(Decl, Ops, this); +} + +CallInst *IRBuilderBase::CreateAddReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add, + Src); +} + +CallInst *IRBuilderBase::CreateMulReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul, + Src); +} + +CallInst *IRBuilderBase::CreateAndReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and, + Src); +} + +CallInst *IRBuilderBase::CreateOrReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or, + Src); +} + +CallInst *IRBuilderBase::CreateXorReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor, + Src); +} + +CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) { + auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax + : Intrinsic::experimental_vector_reduce_umax; + return getReductionIntrinsic(this, ID, Src); +} + +CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) { + auto ID = IsSigned ? 
Intrinsic::experimental_vector_reduce_smin + : Intrinsic::experimental_vector_reduce_umin; + return getReductionIntrinsic(this, ID, Src); +} + +CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) { + auto Rdx = getReductionIntrinsic( + this, Intrinsic::experimental_vector_reduce_fmax, Src); + if (NoNaN) { + FastMathFlags FMF; + FMF.setNoNaNs(); + Rdx->setFastMathFlags(FMF); + } + return Rdx; +} + +CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) { + auto Rdx = getReductionIntrinsic( + this, Intrinsic::experimental_vector_reduce_fmin, Src); + if (NoNaN) { + FastMathFlags FMF; + FMF.setNoNaNs(); + Rdx->setFastMathFlags(FMF); + } + return Rdx; +} + CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { assert(isa(Ptr->getType()) && "lifetime.start only applies to pointers."); diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp index 906a28a5c887..91b9d9232b54 100644 --- a/lib/IR/Instruction.cpp +++ b/lib/IR/Instruction.cpp @@ -534,6 +534,30 @@ bool Instruction::isAtomic() const { } } +bool Instruction::hasAtomicLoad() const { + assert(isAtomic()); + switch (getOpcode()) { + default: + return false; + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + case Instruction::Load: + return true; + } +} + +bool Instruction::hasAtomicStore() const { + assert(isAtomic()); + switch (getOpcode()) { + default: + return false; + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + case Instruction::Store: + return true; + } +} + bool Instruction::mayThrow() const { if (const CallInst *CI = dyn_cast(this)) return !CI->doesNotThrow(); diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index a60cc375d568..5a5b9c0d06bb 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -1,4 +1,4 @@ -//===-- Instructions.cpp - Implement the LLVM instructions ----------------===// +//===- Instructions.cpp - Implement the LLVM instructions -----------------===// // // The LLVM Compiler Infrastructure // @@ -12,18 +12,36 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Instructions.h" #include "LLVMContextImpl.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" -#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include +#include +#include +#include + using namespace llvm; //===----------------------------------------------------------------------===// @@ -42,7 +60,42 @@ User::op_iterator CallSite::getCallee() const { //===----------------------------------------------------------------------===// // Out of line virtual method, so the vtable, etc has a home. 
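A short sketch of the idiom this recurring comment names (illustration, not from the patch): compilers typically emit a class's vtable in the translation unit that defines its first out-of-line virtual member, so keeping one such definition in the .cpp file, even a defaulted destructor, gives the vtable a single home instead of a weak copy in every file that includes the header:

struct Anchor {
  virtual ~Anchor();    // declared in the header only
  virtual void run() {} // other virtuals may be defined inline
};

// Defined once, out of line: the vtable (and RTTI) for Anchor is now
// emitted only in this translation unit.
Anchor::~Anchor() = default;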
-TerminatorInst::~TerminatorInst() { +TerminatorInst::~TerminatorInst() = default; + +unsigned TerminatorInst::getNumSuccessors() const { + switch (getOpcode()) { +#define HANDLE_TERM_INST(N, OPC, CLASS) \ + case Instruction::OPC: \ + return static_cast(this)->getNumSuccessorsV(); +#include "llvm/IR/Instruction.def" + default: + break; + } + llvm_unreachable("not a terminator"); +} + +BasicBlock *TerminatorInst::getSuccessor(unsigned idx) const { + switch (getOpcode()) { +#define HANDLE_TERM_INST(N, OPC, CLASS) \ + case Instruction::OPC: \ + return static_cast(this)->getSuccessorV(idx); +#include "llvm/IR/Instruction.def" + default: + break; + } + llvm_unreachable("not a terminator"); +} + +void TerminatorInst::setSuccessor(unsigned idx, BasicBlock *B) { + switch (getOpcode()) { +#define HANDLE_TERM_INST(N, OPC, CLASS) \ + case Instruction::OPC: \ + return static_cast(this)->setSuccessorV(idx, B); +#include "llvm/IR/Instruction.def" + default: + break; + } + llvm_unreachable("not a terminator"); } //===----------------------------------------------------------------------===// @@ -50,8 +103,7 @@ TerminatorInst::~TerminatorInst() { //===----------------------------------------------------------------------===// // Out of line virtual method, so the vtable, etc has a home. -UnaryInstruction::~UnaryInstruction() { -} +UnaryInstruction::~UnaryInstruction() = default; //===----------------------------------------------------------------------===// // SelectInst Class @@ -82,7 +134,6 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { return nullptr; } - //===----------------------------------------------------------------------===// // PHINode Class //===----------------------------------------------------------------------===// @@ -242,8 +293,7 @@ void LandingPadInst::addClause(Constant *Val) { // CallInst Implementation //===----------------------------------------------------------------------===// -CallInst::~CallInst() { -} +CallInst::~CallInst() = default; void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr) { @@ -541,7 +591,6 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore, ArraySize, OpB, MallocF, Name); } - /// CreateMalloc - Generate the IR for a call to malloc: /// 1. 
Compute the malloc call's argument as the specified type's size, /// possibly multiplied by the array size if the array size is not @@ -692,9 +741,11 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef OpB, BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned InvokeInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) { return setSuccessor(idx, B); } @@ -821,6 +872,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore) if (retVal) Op<0>() = retVal; } + ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd) : TerminatorInst(Type::getVoidTy(C), Instruction::Ret, OperandTraits::op_end(this) - !!retVal, !!retVal, @@ -828,6 +880,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd) if (retVal) Op<0>() = retVal; } + ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd) : TerminatorInst(Type::getVoidTy(Context), Instruction::Ret, OperandTraits::op_end(this), 0, InsertAtEnd) { @@ -847,8 +900,7 @@ BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const { llvm_unreachable("ReturnInst has no successors!"); } -ReturnInst::~ReturnInst() { -} +ReturnInst::~ReturnInst() = default; //===----------------------------------------------------------------------===// // ResumeInst Implementation @@ -930,9 +982,11 @@ BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const { assert(Idx == 0); return getUnwindDest(); } + unsigned CleanupReturnInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { assert(Idx == 0); setUnwindDest(B); @@ -973,9 +1027,11 @@ BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const { assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); return getSuccessor(); } + unsigned CatchReturnInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); setSuccessor(B); @@ -1067,9 +1123,11 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) { BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned CatchSwitchInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } @@ -1155,6 +1213,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore) assert(IfTrue && "Branch destination may not be null!"); Op<-1>() = IfTrue; } + BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, Instruction *InsertBefore) : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, @@ -1189,7 +1248,6 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, #endif } - BranchInst::BranchInst(const BranchInst &BI) : TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br, OperandTraits::op_end(this) - BI.getNumOperands(), @@ -1216,14 +1274,15 @@ void BranchInst::swapSuccessors() { BasicBlock *BranchInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned BranchInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } - 
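The successor accessors above pair with the new TerminatorInst methods earlier in this file: instead of a virtual call per query, the opcode is switched on and the cases are stamped out by an X-macro include of Instruction.def. A reduced, self-contained sketch of that technique, with types invented for illustration:

#include <cstdio>

// One entry per concrete terminator kind; LLVM generates the real list from
// Instruction.def rather than writing it out by hand.
#define FOR_EACH_TERM(X) X(Br, BrInst) X(Ret, RetInst)

enum Opcode { Br, Ret };

struct Inst { Opcode Opc; };
struct BrInst : Inst { unsigned numSuccessorsImpl() const { return 2; } };
struct RetInst : Inst { unsigned numSuccessorsImpl() const { return 0; } };

// Non-virtual dispatch: the switch cases below mirror the HANDLE_TERM_INST
// pattern used by TerminatorInst::getNumSuccessors().
unsigned getNumSuccessors(const Inst &I) {
  switch (I.Opc) {
#define CASE(OPC, CLASS) \
  case OPC: return static_cast<const CLASS &>(I).numSuccessorsImpl();
  FOR_EACH_TERM(CASE)
#undef CASE
  }
  return 0;
}

int main() {
  BrInst B;
  B.Opc = Br;
  std::printf("%u\n", getNumSuccessors(B)); // prints 2
  return 0;
}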
//===----------------------------------------------------------------------===// // AllocaInst Implementation //===----------------------------------------------------------------------===// @@ -1279,8 +1338,7 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, } // Out of line virtual method, so the vtable, etc has a home. -AllocaInst::~AllocaInst() { -} +AllocaInst::~AllocaInst() = default; void AllocaInst::setAlignment(unsigned Align) { assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); @@ -1543,8 +1601,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, SynchronizationScope SynchScope, Instruction *InsertBefore) : Instruction( - StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()), - nullptr), + StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())), AtomicCmpXchg, OperandTraits::op_begin(this), OperandTraits::operands(this), InsertBefore) { Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope); @@ -1556,8 +1613,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, SynchronizationScope SynchScope, BasicBlock *InsertAtEnd) : Instruction( - StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()), - nullptr), + StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())), AtomicCmpXchg, OperandTraits::op_begin(this), OperandTraits::operands(this), InsertAtEnd) { Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope); @@ -1771,14 +1827,12 @@ ExtractElementInst::ExtractElementInst(Value *Val, Value *Index, setName(Name); } - bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) { if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy()) return false; return true; } - //===----------------------------------------------------------------------===// // InsertElementInst Implementation //===----------------------------------------------------------------------===// @@ -1825,7 +1879,6 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt, return true; } - //===----------------------------------------------------------------------===// // ShuffleVectorInst Implementation //===----------------------------------------------------------------------===// @@ -1938,7 +1991,6 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask, } } - //===----------------------------------------------------------------------===// // InsertValueInst Class //===----------------------------------------------------------------------===// @@ -1951,7 +2003,7 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef Idxs, // (other than weirdness with &*IdxBegin being invalid; see // getelementptr's init routine for example). But there's no // present need to support it. - assert(Idxs.size() > 0 && "InsertValueInst must have at least one index"); + assert(!Idxs.empty() && "InsertValueInst must have at least one index"); assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) == Val->getType() && "Inserted value must match indexed type!"); @@ -1980,7 +2032,7 @@ void ExtractValueInst::init(ArrayRef Idxs, const Twine &Name) { // There's no fundamental reason why we require at least one index. // But there's no present need to support it. 
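The AtomicCmpXchg constructors above drop the trailing nullptr from StructType::get because the sentinel-terminated varargs overloads of StructType::get/create/setBody are deleted later in this patch (see the Type.cpp hunks below); callers now pass the element list directly. A self-contained sketch of why the sentinel style was removed, with a stand-in Type and plain std::vector instead of the LLVM API:

#include <cstdarg>
#include <cstdio>
#include <vector>

struct Type { const char *Name; };

// Old style: null-terminated varargs. Forgetting the trailing nullptr walks
// off the end of the argument list, which is undefined behavior.
static std::vector<Type *> getVarArgs(Type *First, ...) {
  std::vector<Type *> Fields;
  va_list Ap;
  va_start(Ap, First);
  for (Type *T = First; T; T = va_arg(Ap, Type *))
    Fields.push_back(T);
  va_end(Ap);
  return Fields;
}

// New style: an explicit list. The element count is always exactly right and
// no sentinel can be forgotten.
static std::vector<Type *> getList(std::vector<Type *> Fields) {
  return Fields;
}

int main() {
  Type I1{"i1"}, I32{"i32"};
  auto A = getVarArgs(&I32, &I1, (Type *)nullptr); // sentinel required
  auto B = getList({&I32, &I1});                   // no sentinel
  std::printf("%zu %zu\n", A.size(), B.size());    // prints: 2 2
  return 0;
}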
- assert(Idxs.size() > 0 && "ExtractValueInst must have at least one index"); + assert(!Idxs.empty() && "ExtractValueInst must have at least one index"); Indices.append(Idxs.begin(), Idxs.end()); setName(Name); @@ -2053,7 +2105,6 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, setName(Name); } - void BinaryOperator::init(BinaryOps iType) { Value *LHS = getOperand(0), *RHS = getOperand(1); (void)LHS; (void)RHS; // Silence warnings. @@ -2213,7 +2264,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name, Op->getType(), Name, InsertAtEnd); } - // isConstantAllOnes - Helper function for several functions below static inline bool isConstantAllOnes(const Value *V) { if (const Constant *C = dyn_cast(V)) @@ -2279,7 +2329,6 @@ const Value *BinaryOperator::getNotArgument(const Value *BinOp) { return getNotArgument(const_cast(BinOp)); } - // Exchange the two operands to this instruction. This instruction is safe to // use on any binary instruction and does not modify the semantics of the // instruction. If the instruction is order-dependent (SetLT f.e.), the opcode @@ -2291,7 +2340,6 @@ bool BinaryOperator::swapOperands() { return false; } - //===----------------------------------------------------------------------===// // FPMathOperator Class //===----------------------------------------------------------------------===// @@ -2305,7 +2353,6 @@ float FPMathOperator::getFPAccuracy() const { return Accuracy->getValueAPF().convertToFloat(); } - //===----------------------------------------------------------------------===// // CastInst Class //===----------------------------------------------------------------------===// @@ -2567,13 +2614,12 @@ unsigned CastInst::isEliminableCastPair( return Instruction::BitCast; return 0; } - case 12: { + case 12: // addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS // addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace()) return Instruction::AddrSpaceCast; return Instruction::BitCast; - } case 13: // FIXME: this state can be merged with (1), but the following assert // is useful to check the correcteness of the sequence due to semantic @@ -2594,7 +2640,6 @@ unsigned CastInst::isEliminableCastPair( DstTy->getScalarType()->getPointerElementType()) return Instruction::AddrSpaceCast; return 0; - case 15: // FIXME: this state can be merged with (1), but the following assert // is useful to check the correcteness of the sequence due to semantic @@ -3070,7 +3115,6 @@ CastInst::getCastOpcode( /// of the types involved. 
bool CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) { - // Check for type sanity on the arguments Type *SrcTy = S->getType(); @@ -3419,7 +3463,6 @@ bool CmpInst::isEquality() const { return cast(this)->isEquality(); } - CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) { switch (pred) { default: llvm_unreachable("Unknown cmp predicate!"); @@ -3743,9 +3786,11 @@ void SwitchInst::growOperands() { BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned SwitchInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } @@ -3832,9 +3877,11 @@ void IndirectBrInst::removeDestination(unsigned idx) { BasicBlock *IndirectBrInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned IndirectBrInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void IndirectBrInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 628a67bd639c..b2b12289f871 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/LegacyPassManager.h" +#include "llvm/ADT/Statistic.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManagers.h" @@ -465,6 +466,11 @@ public: // null. It may be called multiple times. static void createTheTimeInfo(); + // print - Prints out timing information and then resets the timers. + void print() { + TG.print(*CreateInfoOutputFile()); + } + /// getPassTimer - Return the timer for the specified pass if it exists. Timer *getPassTimer(Pass *P) { if (P->getAsPMDataManager()) @@ -1752,6 +1758,13 @@ Timer *llvm::getPassTimer(Pass *P) { return nullptr; } +/// If timing is enabled, report the times collected up to now and then reset +/// them. 
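The helper declared just below, reportAndResetTimings(), prints the accumulated pass timers and then clears them, so that back-to-back codegen runs (see the LTOCodeGenerator and ThinLTOCodeGenerator hunks later in this patch) each report their own numbers instead of a running total. A toy print-and-reset timer group in the same spirit, using std::chrono stand-ins rather than llvm::Timer/TimerGroup:

#include <chrono>
#include <cstdio>
#include <map>
#include <string>

// Stand-in for a timer group: accumulate per-pass wall time, print, reset.
struct TimerGroup {
  std::map<std::string, std::chrono::duration<double>> Times;
  void print() {
    for (const auto &P : Times)
      std::printf("%-20s %.6fs\n", P.first.c_str(), P.second.count());
    Times.clear(); // the reset is what keeps successive reports independent
  }
};

int main() {
  TimerGroup TG;
  auto T0 = std::chrono::steady_clock::now();
  volatile unsigned long X = 0;
  for (unsigned long I = 0; I < 1000000; ++I)
    X += I;
  TG.Times["DummyPass"] += std::chrono::steady_clock::now() - T0;
  TG.print(); // first run's numbers
  TG.print(); // prints nothing: the timers were reset
  return 0;
}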
+void llvm::reportAndResetTimings() { + if (TheTimeInfo) + TheTimeInfo->print(); +} + //===----------------------------------------------------------------------===// // PMStack implementation // diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index fec9df193685..12c258d95f52 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -1,4 +1,4 @@ -//===-- Module.cpp - Implement the Module class ---------------------------===// +//===- Module.cpp - Implement the Module class ----------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,27 +11,46 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Module.h" #include "SymbolTableListTraitsImpl.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Comdat.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalIFunc.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/GVMaterializer.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/SymbolTableListTraits.h" +#include "llvm/IR/Type.h" #include "llvm/IR/TypeFinder.h" -#include "llvm/Support/Dwarf.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/RandomNumberGenerator.h" #include -#include -#include +#include +#include +#include +#include +#include using namespace llvm; diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp index b67b0a307861..c9f957c244f8 100644 --- a/lib/IR/Type.cpp +++ b/lib/IR/Type.cpp @@ -1,4 +1,4 @@ -//===-- Type.cpp - Implement the Type class -------------------------------===// +//===- Type.cpp - Implement the Type class --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,12 +11,25 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Type.h" #include "LLVMContextImpl.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include -#include +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + using namespace llvm; //===----------------------------------------------------------------------===// @@ -220,7 +233,6 @@ PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) { return getInt64Ty(C)->getPointerTo(AS); } - //===----------------------------------------------------------------------===// // IntegerType Implementation //===----------------------------------------------------------------------===// @@ -362,7 +374,8 
@@ void StructType::setName(StringRef Name) { if (Name == getName()) return; StringMap &SymbolTable = getContext().pImpl->NamedStructTypes; - typedef StringMap::MapEntryTy EntryTy; + + using EntryTy = StringMap::MapEntryTy; // If this struct already had a name, remove its symbol table entry. Don't // delete the data yet because it may be part of the new name. @@ -419,21 +432,6 @@ StructType *StructType::get(LLVMContext &Context, bool isPacked) { return get(Context, None, isPacked); } -StructType *StructType::get(Type *type, ...) { - assert(type && "Cannot create a struct type with no elements with this"); - LLVMContext &Ctx = type->getContext(); - va_list ap; - SmallVector StructFields; - va_start(ap, type); - while (type) { - StructFields.push_back(type); - type = va_arg(ap, llvm::Type*); - } - auto *Ret = llvm::StructType::get(Ctx, StructFields); - va_end(ap); - return Ret; -} - StructType *StructType::create(LLVMContext &Context, ArrayRef Elements, StringRef Name, bool isPacked) { StructType *ST = create(Context, Name); @@ -462,21 +460,6 @@ StructType *StructType::create(ArrayRef Elements) { return create(Elements[0]->getContext(), Elements, StringRef()); } -StructType *StructType::create(StringRef Name, Type *type, ...) { - assert(type && "Cannot create a struct type with no elements with this"); - LLVMContext &Ctx = type->getContext(); - va_list ap; - SmallVector StructFields; - va_start(ap, type); - while (type) { - StructFields.push_back(type); - type = va_arg(ap, llvm::Type*); - } - auto *Ret = llvm::StructType::create(Ctx, StructFields, Name); - va_end(ap); - return Ret; -} - bool StructType::isSized(SmallPtrSetImpl *Visited) const { if ((getSubclassData() & SCDB_IsSized) != 0) return true; @@ -508,19 +491,6 @@ StringRef StructType::getName() const { return ((StringMapEntry *)SymbolTableEntry)->getKey(); } -void StructType::setBody(Type *type, ...) { - assert(type && "Cannot create a struct type with no elements with this"); - va_list ap; - SmallVector StructFields; - va_start(ap, type); - while (type) { - StructFields.push_back(type); - type = va_arg(ap, llvm::Type*); - } - setBody(StructFields); - va_end(ap); -} - bool StructType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() && @@ -540,7 +510,6 @@ StructType *Module::getTypeByName(StringRef Name) const { return getContext().pImpl->NamedStructTypes.lookup(Name); } - //===----------------------------------------------------------------------===// // CompositeType Implementation //===----------------------------------------------------------------------===// @@ -589,7 +558,6 @@ bool CompositeType::indexValid(unsigned Idx) const { return true; } - //===----------------------------------------------------------------------===// // ArrayType Implementation //===----------------------------------------------------------------------===// @@ -661,7 +629,6 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) { return Entry; } - PointerType::PointerType(Type *E, unsigned AddrSpace) : Type(E->getContext(), PointerTyID), PointeeTy(E) { ContainedTys = &PointeeTy; diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 65e124562493..3b68d6365872 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -267,6 +267,9 @@ class Verifier : public InstVisitor, VerifierSupport { /// \brief Keep track of the metadata nodes that have been checked already. 
  SmallPtrSet<const Metadata *, 32> MDNodes;
 
+  /// Keep track which DISubprogram is attached to which function.
+  DenseMap<const DISubprogram *, const Function *> DISubprogramAttachments;
+
   /// Track all DICompileUnits visited.
   SmallPtrSet<const Metadata *, 2> CUVisited;
 
@@ -386,7 +389,7 @@ public:
     verifyCompileUnits();
 
     verifyDeoptimizeCallingConvs();
-
+    DISubprogramAttachments.clear();
     return !Broken;
   }
 
@@ -2085,13 +2088,19 @@ void Verifier::visitFunction(const Function &F) {
     switch (I.first) {
     default:
       break;
-    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_dbg: {
       ++NumDebugAttachments;
       AssertDI(NumDebugAttachments == 1,
                "function must have a single !dbg attachment", &F, I.second);
       AssertDI(isa<DISubprogram>(I.second),
                "function !dbg attachment must be a subprogram", &F, I.second);
+      auto *SP = cast<DISubprogram>(I.second);
+      const Function *&AttachedTo = DISubprogramAttachments[SP];
+      AssertDI(!AttachedTo || AttachedTo == &F,
+               "DISubprogram attached to more than one function", SP, &F);
+      AttachedTo = &F;
       break;
+    }
     case LLVMContext::MD_prof:
       ++NumProfAttachments;
       Assert(NumProfAttachments == 1,
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 684b378c93e5..89ddd0fc1af3 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -24,7 +24,6 @@ subdirectories =
   DebugInfo
   Demangle
   ExecutionEngine
-  LibDriver
   LineEditor
   Linker
   IR
@@ -39,6 +38,7 @@ subdirectories =
   Support
   TableGen
   Target
+  ToolDrivers
   Transforms
 
 [component_0]
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 2d2dcdec05fb..c73b6b6b15c1 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -973,7 +973,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
         // this value. If not, no need to preserve any ThinLTO copies.
         !Res.second.IRName.empty())
       GUIDPreservedSymbols.insert(GlobalValue::getGUID(
-          GlobalValue::getRealLinkageName(Res.second.IRName)));
+          GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
   }
 
   auto DeadSymbols =
@@ -993,7 +993,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
     if (Res.second.IRName.empty())
       continue;
     auto GUID = GlobalValue::getGUID(
-        GlobalValue::getRealLinkageName(Res.second.IRName));
+        GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
     // Mark exported unless index-based analysis determined it to be dead.
     if (!DeadSymbols.count(GUID))
       ExportedGUIDs.insert(GUID);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 86fba843e980..6a275560dc92 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -495,17 +495,14 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
     return;
   HasVerifiedInput = true;
 
-  if (LTOStripInvalidDebugInfo) {
-    bool BrokenDebugInfo = false;
-    if (verifyModule(*MergedModule, &dbgs(), &BrokenDebugInfo))
-      report_fatal_error("Broken module found, compilation aborted!");
-    if (BrokenDebugInfo) {
-      emitWarning("Invalid debug info found, debug info will be stripped");
-      StripDebugInfo(*MergedModule);
-    }
-  }
-  if (verifyModule(*MergedModule, &dbgs()))
+  bool BrokenDebugInfo = false;
+  if (verifyModule(*MergedModule, &dbgs(),
+                   LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
     report_fatal_error("Broken module found, compilation aborted!");
+  if (BrokenDebugInfo) {
+    emitWarning("Invalid debug info found, debug info will be stripped");
+    StripDebugInfo(*MergedModule);
+  }
 }
 
 void LTOCodeGenerator::finishOptimizationRemarks() {
@@ -600,6 +597,7 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
 
   // If statistics were requested, print them out after codegen.
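The Verifier hunks above add an attach-once check: the first function seen for a DISubprogram claims the map slot, and any later function that reaches the same subprogram trips the assertion. A self-contained sketch of that idiom using std::unordered_map and stand-in Subprogram/Function types, not the LLVM ones:

#include <cassert>
#include <cstdio>
#include <unordered_map>

struct Subprogram {};
struct Function {};

// Returns false if SP was already recorded for a different function.
static bool recordAttachment(
    std::unordered_map<const Subprogram *, const Function *> &Seen,
    const Subprogram *SP, const Function *F) {
  const Function *&Slot = Seen[SP]; // default-inserts nullptr on first use
  if (Slot && Slot != F)
    return false; // same subprogram reached from a second function
  Slot = F;
  return true;
}

int main() {
  std::unordered_map<const Subprogram *, const Function *> Seen;
  Subprogram SP;
  Function F1, F2;
  assert(recordAttachment(Seen, &SP, &F1));
  assert(recordAttachment(Seen, &SP, &F1));                   // same pair: fine
  std::printf("duplicate ok: %d\n", recordAttachment(Seen, &SP, &F2)); // 0
  return 0;
}

Taking a reference into the map before testing it both performs the lookup and reserves the slot in one step, which is the same trick the DenseMap code above uses.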
if (llvm::AreStatisticsEnabled()) llvm::PrintStatistics(); + reportAndResetTimings(); finishOptimizationRemarks(); diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index b4ee7c2b2fbc..65a7994325bc 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -446,7 +446,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, { raw_svector_ostream OS(OutputBuffer); ProfileSummaryInfo PSI(TheModule); - auto Index = buildModuleSummaryIndex(TheModule, nullptr, nullptr); + auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI); WriteBitcodeToFile(&TheModule, OS, true, &Index); } return make_unique(std::move(OutputBuffer)); @@ -1024,4 +1024,5 @@ void ThinLTOCodeGenerator::run() { // If statistics were requested, print them out now. if (llvm::AreStatisticsEnabled()) llvm::PrintStatistics(); + reportAndResetTimings(); } diff --git a/lib/LibDriver/CMakeLists.txt b/lib/LibDriver/CMakeLists.txt deleted file mode 100644 index ab53a6843446..000000000000 --- a/lib/LibDriver/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS Options.td) -tablegen(LLVM Options.inc -gen-opt-parser-defs) -add_public_tablegen_target(LibOptionsTableGen) - -add_llvm_library(LLVMLibDriver - LibDriver.cpp - ) -add_dependencies(LLVMLibDriver LibOptionsTableGen) diff --git a/lib/LibDriver/LLVMBuild.txt b/lib/LibDriver/LLVMBuild.txt deleted file mode 100644 index 799dc997c0bb..000000000000 --- a/lib/LibDriver/LLVMBuild.txt +++ /dev/null @@ -1,22 +0,0 @@ -;===- ./lib/LibDriver/LLVMBuild.txt ----------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = LibDriver -parent = Libraries -required_libraries = Object Option Support diff --git a/lib/LibDriver/LibDriver.cpp b/lib/LibDriver/LibDriver.cpp deleted file mode 100644 index c50629d71501..000000000000 --- a/lib/LibDriver/LibDriver.cpp +++ /dev/null @@ -1,171 +0,0 @@ -//===- LibDriver.cpp - lib.exe-compatible driver --------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Defines an interface to a lib.exe-compatible driver that also understands -// bitcode files. Used by llvm-lib and lld-link /lib. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/LibDriver/LibDriver.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Object/ArchiveWriter.h" -#include "llvm/Option/Arg.h" -#include "llvm/Option/ArgList.h" -#include "llvm/Option/Option.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/StringSaver.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/Process.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -enum { - OPT_INVALID = 0, -#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID, -#include "Options.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; -#include "Options.inc" -#undef PREFIX - -static const llvm::opt::OptTable::Info infoTable[] = { -#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \ - { \ - X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \ - OPT_##GROUP, OPT_##ALIAS, X6 \ - }, -#include "Options.inc" -#undef OPTION -}; - -class LibOptTable : public llvm::opt::OptTable { -public: - LibOptTable() : OptTable(infoTable, true) {} -}; - -} - -static std::string getOutputPath(llvm::opt::InputArgList *Args, - const llvm::NewArchiveMember &FirstMember) { - if (auto *Arg = Args->getLastArg(OPT_out)) - return Arg->getValue(); - SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier()); - llvm::sys::path::replace_extension(Val, ".lib"); - return Val.str(); -} - -static std::vector getSearchPaths(llvm::opt::InputArgList *Args, - StringSaver &Saver) { - std::vector Ret; - // Add current directory as first item of the search path. - Ret.push_back(""); - - // Add /libpath flags. - for (auto *Arg : Args->filtered(OPT_libpath)) - Ret.push_back(Arg->getValue()); - - // Add $LIB. - Optional EnvOpt = sys::Process::GetEnv("LIB"); - if (!EnvOpt.hasValue()) - return Ret; - StringRef Env = Saver.save(*EnvOpt); - while (!Env.empty()) { - StringRef Path; - std::tie(Path, Env) = Env.split(';'); - Ret.push_back(Path); - } - return Ret; -} - -static Optional findInputFile(StringRef File, - ArrayRef Paths) { - for (auto Dir : Paths) { - SmallString<128> Path = Dir; - sys::path::append(Path, File); - if (sys::fs::exists(Path)) - return Path.str().str(); - } - return Optional(); -} - -int llvm::libDriverMain(llvm::ArrayRef ArgsArr) { - SmallVector NewArgs(ArgsArr.begin(), ArgsArr.end()); - BumpPtrAllocator Alloc; - StringSaver Saver(Alloc); - cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs); - ArgsArr = NewArgs; - - LibOptTable Table; - unsigned MissingIndex; - unsigned MissingCount; - llvm::opt::InputArgList Args = - Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount); - if (MissingCount) { - llvm::errs() << "missing arg value for \"" - << Args.getArgString(MissingIndex) << "\", expected " - << MissingCount - << (MissingCount == 1 ? " argument.\n" : " arguments.\n"); - return 1; - } - for (auto *Arg : Args.filtered(OPT_UNKNOWN)) - llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n"; - - if (!Args.hasArgNoClaim(OPT_INPUT)) { - // No input files. To match lib.exe, silently do nothing. 
- return 0; - } - - std::vector SearchPaths = getSearchPaths(&Args, Saver); - - std::vector Members; - for (auto *Arg : Args.filtered(OPT_INPUT)) { - Optional Path = findInputFile(Arg->getValue(), SearchPaths); - if (!Path.hasValue()) { - llvm::errs() << Arg->getValue() << ": no such file or directory\n"; - return 1; - } - Expected MOrErr = - NewArchiveMember::getFile(Saver.save(*Path), /*Deterministic=*/true); - if (!MOrErr) { - handleAllErrors(MOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) { - llvm::errs() << Arg->getValue() << ": " << EIB.message() << "\n"; - }); - return 1; - } - sys::fs::file_magic Magic = - sys::fs::identify_magic(MOrErr->Buf->getBuffer()); - if (Magic != sys::fs::file_magic::coff_object && - Magic != sys::fs::file_magic::bitcode && - Magic != sys::fs::file_magic::windows_resource) { - llvm::errs() << Arg->getValue() - << ": not a COFF object, bitcode or resource file\n"; - return 1; - } - Members.emplace_back(std::move(*MOrErr)); - } - - std::pair Result = - llvm::writeArchive(getOutputPath(&Args, Members[0]), Members, - /*WriteSymtab=*/true, object::Archive::K_GNU, - /*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin)); - - if (Result.second) { - if (Result.first.empty()) - Result.first = ArgsArr[0]; - llvm::errs() << Result.first << ": " << Result.second.message() << "\n"; - return 1; - } - - return 0; -} diff --git a/lib/LibDriver/Options.td b/lib/LibDriver/Options.td deleted file mode 100644 index 5a56ef7468d4..000000000000 --- a/lib/LibDriver/Options.td +++ /dev/null @@ -1,25 +0,0 @@ -include "llvm/Option/OptParser.td" - -// lib.exe accepts options starting with either a dash or a slash. - -// Flag that takes no arguments. -class F : Flag<["/", "-", "-?"], name>; - -// Flag that takes one argument after ":". -class P : - Joined<["/", "-", "-?"], name#":">, HelpText; - -def libpath: P<"libpath", "Object file search path">; -def out : P<"out", "Path to file to write output">; - -def llvmlibthin : F<"llvmlibthin">; - -//============================================================================== -// The flags below do nothing. They are defined only for lib.exe compatibility. -//============================================================================== - -class QF : Joined<["/", "-", "-?"], name#":">; - -def ignore : QF<"ignore">; -def machine: QF<"machine">; -def nologo : F<"nologo">; diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp index 15a46a2d0420..ecef1efda1a2 100644 --- a/lib/Linker/IRMover.cpp +++ b/lib/Linker/IRMover.cpp @@ -602,6 +602,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), SGVar->getType()->getAddressSpace()); NewDGV->setAlignment(SGVar->getAlignment()); + NewDGV->copyAttributesFrom(SGVar); return NewDGV; } @@ -610,8 +611,11 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { Function *IRLinker::copyFunctionProto(const Function *SF) { // If there is no linkage to be performed or we are linking from the source, // bring SF over. - return Function::Create(TypeMap.get(SF->getFunctionType()), - GlobalValue::ExternalLinkage, SF->getName(), &DstM); + auto *F = + Function::Create(TypeMap.get(SF->getFunctionType()), + GlobalValue::ExternalLinkage, SF->getName(), &DstM); + F->copyAttributesFrom(SF); + return F; } /// Set up prototypes for any aliases that come over from the source module. 
@@ -619,9 +623,11 @@ GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) { // If there is no linkage to be performed or we're linking from the source, // bring over SGA. auto *Ty = TypeMap.get(SGA->getValueType()); - return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), - GlobalValue::ExternalLinkage, SGA->getName(), - &DstM); + auto *GA = + GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), + GlobalValue::ExternalLinkage, SGA->getName(), &DstM); + GA->copyAttributesFrom(SGA); + return GA; } GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV, @@ -648,8 +654,6 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV, else if (SGV->hasExternalWeakLinkage()) NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); - NewGV->copyAttributesFrom(SGV); - if (auto *NewGO = dyn_cast(NewGV)) { // Metadata for global variables and function declarations is copied eagerly. if (isa(SGV) || SGV->isDeclaration()) diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index f7f2253256eb..174397e27396 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -133,6 +133,11 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, // Avoid fixups when possible. int64_t AbsValue; if (Value->evaluateAsAbsolute(AbsValue, getAssembler())) { + if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) { + getContext().reportError( + Loc, "value evaluated as " + Twine(AbsValue) + " is out of range."); + return; + } EmitIntValue(AbsValue, Size); return; } diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 66ba853da2fe..3b213ef4ce09 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -288,6 +288,7 @@ public: private: bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc); + void altMacroString(StringRef AltMacroStr, std::string &Res); bool parseStatement(ParseStatementInfo &Info, MCAsmParserSemaCallback *SI); bool parseCurlyBlockScope(SmallVectorImpl& AsmStrRewrites); @@ -1209,6 +1210,8 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) { const char *CharPtr = StrLoc.getPointer(); while ((*CharPtr != '>') && (*CharPtr != '\n') && (*CharPtr != '\r') && (*CharPtr != '\0')){ + if(*CharPtr == '!') + CharPtr++; CharPtr++; } if (*CharPtr == '>') { @@ -1218,6 +1221,15 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) { return false; } +/// \brief creating a string without the escape characters '!'. +void AsmParser::altMacroString(StringRef AltMacroStr,std::string &Res) { + for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) { + if (AltMacroStr[Pos] == '!') + Pos++; + Res += AltMacroStr[Pos]; + } +} + /// \brief Parse an expression and return it. /// /// expr ::= expr &&,|| expr -> lowest. @@ -2309,6 +2321,15 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, (*(Token.getString().begin()) == '%') && Token.is(AsmToken::Integer)) // Emit an integer value to the buffer. OS << Token.getIntVal(); + // Only Token that was validated as a string and begins with '<' + // is considered altMacroString!!! + else if ((Lexer.IsaAltMacroMode()) && + (*(Token.getString().begin()) == '<') && + Token.is(AsmToken::String)) { + std::string Res; + altMacroString(Token.getStringContents(), Res); + OS << Res; + } // We expect no quotes around the string's contents when // parsing for varargs. 
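The AsmParser changes above teach altmacro string handling that '!' quotes the next character, both when validating a <...> string and when expanding it (altMacroString strips the escapes). A small standalone helper mirroring that unescaping loop; the function name and the extra bounds guard for a trailing lone '!' are mine, not part of the patch:

#include <cstdio>
#include <string>

// '!' quotes the next character inside an altmacro <...> string, so
// "a!>b" denotes the three characters a, >, b.
static std::string unescapeAltmacro(const std::string &S) {
  std::string Res;
  for (size_t Pos = 0; Pos < S.size(); ++Pos) {
    if (S[Pos] == '!')
      ++Pos;              // skip the escape, keep the escaped character
    if (Pos < S.size())   // guard against a trailing lone '!'
      Res += S[Pos];
  }
  return Res;
}

int main() {
  std::printf("%s\n", unescapeAltmacro("a!>b").c_str()); // prints: a>b
  return 0;
}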
else if (Token.isNot(AsmToken::String) || VarargParameter) diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index b1223e81be43..28531feccfe1 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -1062,7 +1062,7 @@ COFFObjectFile::getSectionContents(const coff_section *Sec, // In COFF, a virtual section won't have any in-file // content, so the file pointer to the content will be zero. if (Sec->PointerToRawData == 0) - return object_error::parse_failed; + return std::error_code(); // The only thing that we need to verify is that the contents is contained // within the file bounds. We don't need to make sure it doesn't cover other // data, as there's nothing that says that is not allowed. @@ -1602,8 +1602,6 @@ ErrorOr> ResourceSectionRef::getDirStringAtOffset(uint32_t Offse uint16_t Length; RETURN_IF_ERROR(Reader.readInteger(Length)); ArrayRef RawDirString; - // Strings are stored as 2-byte aligned unicode characters but readFixedString - // assumes byte string, so we double length. RETURN_IF_ERROR(Reader.readArray(RawDirString, Length)); return RawDirString; } diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 39f8704aacf2..058686e4db9e 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -168,6 +168,13 @@ static wasm::WasmLimits readLimits(const uint8_t *&Ptr) { return Result; } +static wasm::WasmTable readTable(const uint8_t *&Ptr) { + wasm::WasmTable Table; + Table.ElemType = readVarint7(Ptr); + Table.Limits = readLimits(Ptr); + return Table; +} + static Error readSection(WasmSection &Section, const uint8_t *&Ptr, const uint8_t *Start) { // TODO(sbc): Avoid reading past EOF in the case of malformed files. @@ -397,13 +404,22 @@ Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End) Sections.size(), i); break; case wasm::WASM_EXTERNAL_GLOBAL: - Im.GlobalType = readVarint7(Ptr); - Im.GlobalMutable = readVaruint1(Ptr); + Im.Global.Type = readVarint7(Ptr); + Im.Global.Mutable = readVaruint1(Ptr); Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT, Sections.size(), i); break; + case wasm::WASM_EXTERNAL_MEMORY: + Im.Memory = readLimits(Ptr); + break; + case wasm::WASM_EXTERNAL_TABLE: + Im.Table = readTable(Ptr); + if (Im.Table.ElemType != wasm::WASM_TYPE_ANYFUNC) { + return make_error("Invalid table element type", + object_error::parse_failed); + } + break; default: - // TODO(sbc): Handle other kinds of imports return make_error( "Unexpected import kind", object_error::parse_failed); } @@ -431,14 +447,11 @@ Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End) uint32_t Count = readVaruint32(Ptr); Tables.reserve(Count); while (Count--) { - wasm::WasmTable Table; - Table.ElemType = readVarint7(Ptr); - if (Table.ElemType != wasm::WASM_TYPE_ANYFUNC) { + Tables.push_back(readTable(Ptr)); + if (Tables.back().ElemType != wasm::WASM_TYPE_ANYFUNC) { return make_error("Invalid table element type", object_error::parse_failed); } - Table.Limits = readLimits(Ptr); - Tables.push_back(Table); } if (Ptr != End) return make_error("Table section ended prematurely", @@ -493,8 +506,10 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT, Sections.size(), i); break; + case wasm::WASM_EXTERNAL_MEMORY: + case wasm::WASM_EXTERNAL_TABLE: + break; default: - // TODO(sbc): Handle other kinds of exports return make_error( "Unexpected export 
kind", object_error::parse_failed); } @@ -507,7 +522,7 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) { StartFunction = readVaruint32(Ptr); - if (StartFunction < FunctionTypes.size()) + if (StartFunction >= FunctionTypes.size()) return make_error("Invalid start function", object_error::parse_failed); return Error::success(); @@ -638,10 +653,14 @@ basic_symbol_iterator WasmObjectFile::symbol_end() const { return BasicSymbolRef(Ref, this); } -const WasmSymbol &WasmObjectFile::getWasmSymbol(DataRefImpl Symb) const { +const WasmSymbol &WasmObjectFile::getWasmSymbol(const DataRefImpl &Symb) const { return Symbols[Symb.d.a]; } +const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const { + return getWasmSymbol(Symb.getRawDataRefImpl()); +} + Expected WasmObjectFile::getSymbolName(DataRefImpl Symb) const { const WasmSymbol &Sym = getWasmSymbol(Symb); return Sym.Name; diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index c5d1b438ee2a..910d32f16af9 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -265,8 +265,12 @@ void MappingTraits::mapping(IO &IO, if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) { IO.mapRequired("SigIndex", Import.SigIndex); } else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) { - IO.mapRequired("GlobalType", Import.GlobalType); - IO.mapRequired("GlobalMutable", Import.GlobalMutable); + IO.mapRequired("GlobalType", Import.GlobalImport.Type); + IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable); + } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) { + IO.mapRequired("Table", Import.TableImport); + } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY ) { + IO.mapRequired("Memory", Import.Memory); } else { llvm_unreachable("unhandled import type"); } diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp index b91b6fb7c7ad..b05efa7417b9 100644 --- a/lib/ProfileData/SampleProfWriter.cpp +++ b/lib/ProfileData/SampleProfWriter.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,32 @@ using namespace llvm; using namespace sampleprof; +std::error_code +SampleProfileWriter::write(const StringMap &ProfileMap) { + if (std::error_code EC = writeHeader(ProfileMap)) + return EC; + + // Sort the ProfileMap by total samples. + typedef std::pair NameFunctionSamples; + std::vector V; + for (const auto &I : ProfileMap) + V.push_back(std::make_pair(I.getKey(), &I.second)); + + std::stable_sort( + V.begin(), V.end(), + [](const NameFunctionSamples &A, const NameFunctionSamples &B) { + if (A.second->getTotalSamples() == B.second->getTotalSamples()) + return A.first > B.first; + return A.second->getTotalSamples() > B.second->getTotalSamples(); + }); + + for (const auto &I : V) { + if (std::error_code EC = write(*I.second)) + return EC; + } + return sampleprof_error::success; +} + /// \brief Write samples to a text file. 
/// /// Note: it may be tempting to implement this in terms of @@ -97,8 +124,7 @@ std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) { } void SampleProfileWriterBinary::addName(StringRef FName) { - auto NextIdx = NameTable.size(); - NameTable.insert(std::make_pair(FName, NextIdx)); + NameTable.insert(std::make_pair(FName, 0)); } void SampleProfileWriterBinary::addNames(const FunctionSamples &S) { @@ -136,10 +162,18 @@ std::error_code SampleProfileWriterBinary::writeHeader( addNames(I.second); } + // Sort the names to make NameTable is deterministic. + std::set V; + for (const auto &I : NameTable) + V.insert(I.first); + int i = 0; + for (const StringRef &N : V) + NameTable[N] = i++; + // Write out the name table. encodeULEB128(NameTable.size(), OS); - for (auto N : NameTable) { - OS << N.first; + for (auto N : V) { + OS << N; encodeULEB128(0, OS); } return sampleprof_error::success; diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index caa0691f9205..17144522db82 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -122,35 +122,38 @@ APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix) fromString(numbits, Str, radix); } +void APInt::reallocate(unsigned NewBitWidth) { + // If the number of words is the same we can just change the width and stop. + if (getNumWords() == getNumWords(NewBitWidth)) { + BitWidth = NewBitWidth; + return; + } + + // If we have an allocation, delete it. + if (!isSingleWord()) + delete [] U.pVal; + + // Update BitWidth. + BitWidth = NewBitWidth; + + // If we are supposed to have an allocation, create it. + if (!isSingleWord()) + U.pVal = getMemory(getNumWords()); +} + void APInt::AssignSlowCase(const APInt& RHS) { // Don't do anything for X = X if (this == &RHS) return; - if (BitWidth == RHS.getBitWidth()) { - // assume same bit-width single-word case is already handled - assert(!isSingleWord()); - memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE); - return; - } + // Adjust the bit width and handle allocations as necessary. + reallocate(RHS.getBitWidth()); - if (isSingleWord()) { - // assume case where both are single words is already handled - assert(!RHS.isSingleWord()); - U.pVal = getMemory(RHS.getNumWords()); - memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE); - } else if (getNumWords() == RHS.getNumWords()) - memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE); - else if (RHS.isSingleWord()) { - delete [] U.pVal; + // Copy the data. + if (isSingleWord()) U.VAL = RHS.U.VAL; - } else { - delete [] U.pVal; - U.pVal = getMemory(RHS.getNumWords()); - memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE); - } - BitWidth = RHS.BitWidth; - clearUnusedBits(); + else + memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE); } /// This method 'profiles' an APInt for use with FoldingSet. @@ -1138,10 +1141,13 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const { return APInt(BitWidth, 0); // The next-to-last t is the multiplicative inverse. However, we are - // interested in a positive inverse. Calcuate a positive one from a negative + // interested in a positive inverse. Calculate a positive one from a negative // one if necessary. A simple addition of the modulo suffices because // abs(t[i]) is known to be less than *this/2 (see the link above). - return t[i].isNegative() ? 
t[i] + modulo : t[i]; + if (t[i].isNegative()) + t[i] += modulo; + + return std::move(t[i]); } /// Calculate the magic numbers required to implement a signed integer division @@ -1240,7 +1246,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const { /// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The /// variables here have the same names as in the algorithm. Comments explain /// the algorithm and any deviation from it. -static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, +static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r, unsigned m, unsigned n) { assert(u && "Must provide dividend"); assert(v && "Must provide divisor"); @@ -1266,16 +1272,16 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // overflow. Note that this can require an extra word in u so that u must // be of length m+n+1. unsigned shift = countLeadingZeros(v[n-1]); - unsigned v_carry = 0; - unsigned u_carry = 0; + uint32_t v_carry = 0; + uint32_t u_carry = 0; if (shift) { for (unsigned i = 0; i < m+n; ++i) { - unsigned u_tmp = u[i] >> (32 - shift); + uint32_t u_tmp = u[i] >> (32 - shift); u[i] = (u[i] << shift) | u_carry; u_carry = u_tmp; } for (unsigned i = 0; i < n; ++i) { - unsigned v_tmp = v[i] >> (32 - shift); + uint32_t v_tmp = v[i] >> (32 - shift); v[i] = (v[i] << shift) | v_carry; v_carry = v_tmp; } @@ -1296,11 +1302,11 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q') // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r') // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease - // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test + // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test // on v[n-2] determines at high speed most of the cases in which the trial // value qp is one too large, and it eliminates all cases where qp is two // too large. - uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]); + uint64_t dividend = Make_64(u[j+n], u[j+n-1]); DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n'); uint64_t qp = dividend / v[n-1]; uint64_t rp = dividend % v[n-1]; @@ -1323,14 +1329,14 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, int64_t borrow = 0; for (unsigned i = 0; i < n; ++i) { uint64_t p = uint64_t(qp) * uint64_t(v[i]); - int64_t subres = int64_t(u[j+i]) - borrow - (unsigned)p; - u[j+i] = (unsigned)subres; - borrow = (p >> 32) - (subres >> 32); + int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p); + u[j+i] = Lo_32(subres); + borrow = Hi_32(p) - Hi_32(subres); DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j+i] << ", borrow = " << borrow << '\n'); } bool isNeg = u[j+n] < borrow; - u[j+n] -= (unsigned)borrow; + u[j+n] -= Lo_32(borrow); DEBUG(dbgs() << "KnuthDiv: after subtraction:"); DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]); @@ -1338,7 +1344,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was // negative, go to step D6; otherwise go on to step D7. - q[j] = (unsigned)qp; + q[j] = Lo_32(qp); if (isNeg) { // D6. [Add back]. The probability that this step is necessary is very // small, on the order of only 2/b. Make sure that test data accounts for @@ -1349,7 +1355,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // since it cancels with the borrow that occurred in D4. 
bool carry = false; for (unsigned i = 0; i < n; i++) { - unsigned limit = std::min(u[j+i],v[i]); + uint32_t limit = std::min(u[j+i],v[i]); u[j+i] += v[i] + carry; carry = u[j+i] < limit || (carry && u[j+i] == limit); } @@ -1374,7 +1380,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // multiplication by d by using a shift left. So, all we have to do is // shift right here. if (shift) { - unsigned carry = 0; + uint32_t carry = 0; DEBUG(dbgs() << "KnuthDiv: remainder:"); for (int i = n-1; i >= 0; i--) { r[i] = (u[i] >> shift) | carry; @@ -1403,17 +1409,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, // can't use 64-bit operands here because we don't have native results of // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't // work on large-endian machines. - uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT); unsigned n = rhsWords * 2; unsigned m = (lhsWords * 2) - n; // Allocate space for the temporary values we need either on the stack, if // it will fit, or on the heap if it won't. - unsigned SPACE[128]; - unsigned *U = nullptr; - unsigned *V = nullptr; - unsigned *Q = nullptr; - unsigned *R = nullptr; + uint32_t SPACE[128]; + uint32_t *U = nullptr; + uint32_t *V = nullptr; + uint32_t *Q = nullptr; + uint32_t *R = nullptr; if ((Remainder?4:3)*n+2*m+1 <= 128) { U = &SPACE[0]; V = &SPACE[m+n+1]; @@ -1421,34 +1426,34 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, if (Remainder) R = &SPACE[(m+n+1) + n + (m+n)]; } else { - U = new unsigned[m + n + 1]; - V = new unsigned[n]; - Q = new unsigned[m+n]; + U = new uint32_t[m + n + 1]; + V = new uint32_t[n]; + Q = new uint32_t[m+n]; if (Remainder) - R = new unsigned[n]; + R = new uint32_t[n]; } // Initialize the dividend - memset(U, 0, (m+n+1)*sizeof(unsigned)); + memset(U, 0, (m+n+1)*sizeof(uint32_t)); for (unsigned i = 0; i < lhsWords; ++i) { - uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.U.VAL : LHS.U.pVal[i]); - U[i * 2] = (unsigned)(tmp & mask); - U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT)); + uint64_t tmp = LHS.getRawData()[i]; + U[i * 2] = Lo_32(tmp); + U[i * 2 + 1] = Hi_32(tmp); } U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm. // Initialize the divisor - memset(V, 0, (n)*sizeof(unsigned)); + memset(V, 0, (n)*sizeof(uint32_t)); for (unsigned i = 0; i < rhsWords; ++i) { - uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.U.VAL : RHS.U.pVal[i]); - V[i * 2] = (unsigned)(tmp & mask); - V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT)); + uint64_t tmp = RHS.getRawData()[i]; + V[i * 2] = Lo_32(tmp); + V[i * 2 + 1] = Hi_32(tmp); } // initialize the quotient and remainder - memset(Q, 0, (m+n) * sizeof(unsigned)); + memset(Q, 0, (m+n) * sizeof(uint32_t)); if (Remainder) - memset(R, 0, n * sizeof(unsigned)); + memset(R, 0, n * sizeof(uint32_t)); // Now, adjust m and n for the Knuth division. n is the number of words in // the divisor. m is the number of words by which the dividend exceeds the @@ -1469,22 +1474,22 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, // are using base 2^32 instead of base 10. 
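The KnuthDiv and divide() hunks in this region switch the working arrays from unsigned to explicit uint32_t and replace shift/mask idioms with Lo_32/Hi_32/Make_64, so the code no longer silently assumes unsigned is 32 bits wide. Self-contained equivalents of those helpers (the real ones live in llvm/Support/MathExtras.h) and a quick round-trip check:

#include <cassert>
#include <cstdint>

static inline uint32_t Lo_32(uint64_t V) { return static_cast<uint32_t>(V); }
static inline uint32_t Hi_32(uint64_t V) {
  return static_cast<uint32_t>(V >> 32);
}
static inline uint64_t Make_64(uint32_t Hi, uint32_t Lo) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t V = 0x0123456789abcdefULL;
  assert(Lo_32(V) == 0x89abcdefu);
  assert(Hi_32(V) == 0x01234567u);
  // Splitting and rejoining is lossless, which is all the divider needs.
  assert(Make_64(Hi_32(V), Lo_32(V)) == V);
  return 0;
}

Named halves also make the borrow arithmetic in step D4 above much harder to get wrong than the old (unsigned)p and (p >> 32) casts.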
assert(n != 0 && "Divide by zero?"); if (n == 1) { - unsigned divisor = V[0]; - unsigned remainder = 0; - for (int i = m+n-1; i >= 0; i--) { - uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i]; + uint32_t divisor = V[0]; + uint32_t remainder = 0; + for (int i = m; i >= 0; i--) { + uint64_t partial_dividend = Make_64(remainder, U[i]); if (partial_dividend == 0) { Q[i] = 0; remainder = 0; } else if (partial_dividend < divisor) { Q[i] = 0; - remainder = (unsigned)partial_dividend; + remainder = Lo_32(partial_dividend); } else if (partial_dividend == divisor) { Q[i] = 1; remainder = 0; } else { - Q[i] = (unsigned)(partial_dividend / divisor); - remainder = (unsigned)(partial_dividend - (Q[i] * divisor)); + Q[i] = Lo_32(partial_dividend / divisor); + remainder = Lo_32(partial_dividend - (Q[i] * divisor)); } } if (R) @@ -1498,24 +1503,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, // If the caller wants the quotient if (Quotient) { // Set up the Quotient value's memory. - if (Quotient->BitWidth != LHS.BitWidth) { - if (Quotient->isSingleWord()) - Quotient->U.VAL = 0; - else - delete [] Quotient->U.pVal; - Quotient->BitWidth = LHS.BitWidth; - if (!Quotient->isSingleWord()) - Quotient->U.pVal = getClearedMemory(Quotient->getNumWords()); - } else - Quotient->clearAllBits(); + Quotient->reallocate(LHS.BitWidth); + // Clear out any previous bits. + Quotient->clearAllBits(); // The quotient is in Q. Reconstitute the quotient into Quotient's low // order words. // This case is currently dead as all users of divide() handle trivial cases // earlier. if (lhsWords == 1) { - uint64_t tmp = - uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2)); + uint64_t tmp = Make_64(Q[1], Q[0]); if (Quotient->isSingleWord()) Quotient->U.VAL = tmp; else @@ -1523,30 +1520,21 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, } else { assert(!Quotient->isSingleWord() && "Quotient APInt not large enough"); for (unsigned i = 0; i < lhsWords; ++i) - Quotient->U.pVal[i] = - uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2)); + Quotient->U.pVal[i] = Make_64(Q[i*2+1], Q[i*2]); } } // If the caller wants the remainder if (Remainder) { // Set up the Remainder value's memory. - if (Remainder->BitWidth != RHS.BitWidth) { - if (Remainder->isSingleWord()) - Remainder->U.VAL = 0; - else - delete [] Remainder->U.pVal; - Remainder->BitWidth = RHS.BitWidth; - if (!Remainder->isSingleWord()) - Remainder->U.pVal = getClearedMemory(Remainder->getNumWords()); - } else - Remainder->clearAllBits(); + Remainder->reallocate(RHS.BitWidth); + // Clear out any previous bits. + Remainder->clearAllBits(); // The remainder is in R. Reconstitute the remainder into Remainder's low // order words. 
if (rhsWords == 1) { - uint64_t tmp = - uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2)); + uint64_t tmp = Make_64(R[1], R[0]); if (Remainder->isSingleWord()) Remainder->U.VAL = tmp; else @@ -1554,8 +1542,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, } else { assert(!Remainder->isSingleWord() && "Remainder APInt not large enough"); for (unsigned i = 0; i < rhsWords; ++i) - Remainder->U.pVal[i] = - uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2)); + Remainder->U.pVal[i] = Make_64(R[i*2+1], R[i*2]); } } @@ -1578,29 +1565,30 @@ APInt APInt::udiv(const APInt& RHS) const { } // Get some facts about the LHS and RHS number of bits and words - unsigned rhsBits = RHS.getActiveBits(); - unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1); + unsigned lhsWords = getNumWords(getActiveBits()); + unsigned rhsBits = RHS.getActiveBits(); + unsigned rhsWords = getNumWords(rhsBits); assert(rhsWords && "Divided by zero???"); - unsigned lhsBits = this->getActiveBits(); - unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1); // Deal with some degenerate cases if (!lhsWords) // 0 / X ===> 0 return APInt(BitWidth, 0); - else if (lhsWords < rhsWords || this->ult(RHS)) { + if (rhsBits == 1) + // X / 1 ===> X + return *this; + if (lhsWords < rhsWords || this->ult(RHS)) // X / Y ===> 0, iff X < Y return APInt(BitWidth, 0); - } else if (*this == RHS) { + if (*this == RHS) // X / X ===> 1 return APInt(BitWidth, 1); - } else if (lhsWords == 1 && rhsWords == 1) { + if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1. // All high words are zero, just use native divide return APInt(BitWidth, this->U.pVal[0] / RHS.U.pVal[0]); - } // We have to compute it the hard way. Invoke the Knuth divide algorithm. - APInt Quotient(1,0); // to hold result. + APInt Quotient; // to hold result. divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr); return Quotient; } @@ -1624,31 +1612,32 @@ APInt APInt::urem(const APInt& RHS) const { } // Get some facts about the LHS - unsigned lhsBits = getActiveBits(); - unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + unsigned lhsWords = getNumWords(getActiveBits()); // Get some facts about the RHS unsigned rhsBits = RHS.getActiveBits(); - unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1); + unsigned rhsWords = getNumWords(rhsBits); assert(rhsWords && "Performing remainder operation by zero ???"); // Check the degenerate cases - if (lhsWords == 0) { + if (lhsWords == 0) // 0 % Y ===> 0 return APInt(BitWidth, 0); - } else if (lhsWords < rhsWords || this->ult(RHS)) { + if (rhsBits == 1) + // X % 1 ===> 0 + return APInt(BitWidth, 0); + if (lhsWords < rhsWords || this->ult(RHS)) // X % Y ===> X, iff X < Y return *this; - } else if (*this == RHS) { + if (*this == RHS) // X % X == 0; return APInt(BitWidth, 0); - } else if (lhsWords == 1) { + if (lhsWords == 1) // All high words are zero, just use native remainder return APInt(BitWidth, U.pVal[0] % RHS.U.pVal[0]); - } // We have to compute it the hard way. Invoke the Knuth divide algorithm. 
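The hunks that follow replace width-1 placeholder results such as APInt Remainder(1,0) with default-constructed APInts that divide() resizes itself through the new reallocate(), so callers no longer guess an output width. A toy version of that resize-in-the-callee out-parameter idiom, with a vector-backed stand-in rather than APInt:

#include <cassert>
#include <vector>

struct BigInt {
  std::vector<unsigned> Words; // little-endian 32-bit words; empty == unsized
  void reallocate(size_t NumWords) { Words.assign(NumWords, 0); }
};

static void divide(const BigInt &Lhs, unsigned Rhs, BigInt *Quotient) {
  if (!Quotient)
    return;
  Quotient->reallocate(Lhs.Words.size()); // the callee picks the right size
  unsigned Carry = 0;
  for (size_t I = Lhs.Words.size(); I-- > 0;) { // most-significant word first
    unsigned long long Cur =
        (static_cast<unsigned long long>(Carry) << 32) | Lhs.Words[I];
    Quotient->Words[I] = static_cast<unsigned>(Cur / Rhs);
    Carry = static_cast<unsigned>(Cur % Rhs); // Carry ends as the remainder
  }
}

int main() {
  BigInt A;
  A.Words = {7, 0, 1}; // represents 2^64 + 7
  BigInt Q;            // default-constructed, like 'APInt Quotient;'
  divide(A, 2, &Q);
  assert(Q.Words.size() == 3); // sized by the callee, not the caller
  return 0;
}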
-  APInt Remainder(1,0);
+  APInt Remainder;
   divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder);
   return Remainder;
 }
@@ -1667,22 +1656,23 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
 void APInt::udivrem(const APInt &LHS, const APInt &RHS,
                     APInt &Quotient, APInt &Remainder) {
   assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  unsigned BitWidth = LHS.BitWidth;
 
   // First, deal with the easy case
   if (LHS.isSingleWord()) {
     assert(RHS.U.VAL != 0 && "Divide by zero?");
     uint64_t QuotVal = LHS.U.VAL / RHS.U.VAL;
     uint64_t RemVal = LHS.U.VAL % RHS.U.VAL;
-    Quotient = APInt(LHS.BitWidth, QuotVal);
-    Remainder = APInt(LHS.BitWidth, RemVal);
+    Quotient = APInt(BitWidth, QuotVal);
+    Remainder = APInt(BitWidth, RemVal);
     return;
   }
 
   // Get some size facts about the dividend and divisor
-  unsigned lhsBits = LHS.getActiveBits();
-  unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+  unsigned lhsWords = getNumWords(LHS.getActiveBits());
   unsigned rhsBits = RHS.getActiveBits();
-  unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+  unsigned rhsWords = getNumWords(rhsBits);
+  assert(rhsWords && "Performing divrem operation by zero ???");
 
   // Check the degenerate cases
   if (lhsWords == 0) {
@@ -1691,6 +1681,12 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
     return;
   }
 
+  if (rhsBits == 1) {
+    Quotient = LHS;                   // X / 1 ===> X
+    Remainder = 0;                    // X % 1 ===> 0
+    return;
+  }
+
   if (lhsWords < rhsWords || LHS.ult(RHS)) {
     Remainder = LHS;            // X % Y ===> X, iff X < Y
     Quotient = 0;               // X / Y ===> 0, iff X < Y
@@ -1703,12 +1698,15 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
     return;
   }
 
-  if (lhsWords == 1 && rhsWords == 1) {
+  if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
     // There is only one word to consider so use the native versions.
-    uint64_t lhsValue = LHS.isSingleWord() ? LHS.U.VAL : LHS.U.pVal[0];
-    uint64_t rhsValue = RHS.isSingleWord() ? RHS.U.VAL : RHS.U.pVal[0];
-    Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
-    Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+    uint64_t lhsValue = LHS.U.pVal[0];
+    uint64_t rhsValue = RHS.U.pVal[0];
+    // Make sure there is enough space to hold the results.
+    Quotient.reallocate(BitWidth);
+    Remainder.reallocate(BitWidth);
+    Quotient = lhsValue / rhsValue;
+    Remainder = lhsValue % rhsValue;
     return;
   }
@@ -1723,12 +1721,12 @@ void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
       APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
     else {
       APInt::udivrem(-LHS, RHS, Quotient, Remainder);
-      Quotient = -Quotient;
+      Quotient.negate();
     }
-    Remainder = -Remainder;
+    Remainder.negate();
   } else if (RHS.isNegative()) {
     APInt::udivrem(LHS, -RHS, Quotient, Remainder);
-    Quotient = -Quotient;
+    Quotient.negate();
   } else {
     APInt::udivrem(LHS, RHS, Quotient, Remainder);
   }
@@ -1859,10 +1857,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
     *this += digit;
   }
   // If its negative, put it in two's complement form
-  if (isNeg) {
-    --(*this);
-    this->flipAllBits();
-  }
+  if (isNeg)
+    this->negate();
 }
 
 void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
@@ -1940,8 +1936,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
       // They want to print the signed version and it is a negative value
      // Flip the bits and add one to turn it into the equivalent positive
       // value and put a '-' in the result.
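The conversions in the surrounding hunks fold two hand-written two's-complement idioms, flip-then-increment and decrement-then-flip, into a single negate() call. A quick standalone check that all three forms agree on 32-bit values:

#include <cassert>
#include <cstdint>

int main() {
  // negate() is two's-complement negation: ~V + 1, equivalently ~(V - 1).
  for (uint32_t V : {0u, 1u, 42u, 0x80000000u, 0xffffffffu}) {
    (void)V; // keeps -Wunused quiet if asserts are compiled out
    assert(~V + 1u == 0u - V);  // flip all bits, then add one
    assert(~(V - 1u) == 0u - V); // decrement, then flip all bits
  }
  return 0;
}

Besides reading better, the single call lets the in-place negation skip one full carry pass over the word array.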
- Tmp.flipAllBits(); - ++Tmp; + Tmp.negate(); Str.push_back('-'); } @@ -1961,22 +1956,19 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1)); unsigned MaskAmt = Radix - 1; - while (Tmp != 0) { + while (Tmp.getBoolValue()) { unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt; Str.push_back(Digits[Digit]); Tmp.lshrInPlace(ShiftAmt); } } else { - APInt divisor(Radix == 10? 4 : 8, Radix); - while (Tmp != 0) { - APInt APdigit(1, 0); - APInt tmp2(Tmp.getBitWidth(), 0); - divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2, - &APdigit); + APInt divisor(Tmp.getBitWidth(), Radix); + APInt APdigit; + while (Tmp.getBoolValue()) { + udivrem(Tmp, divisor, Tmp, APdigit); unsigned Digit = (unsigned)APdigit.getZExtValue(); assert(Digit < Radix && "divide failed"); Str.push_back(Digits[Digit]); - Tmp = tmp2; } } @@ -2346,13 +2338,11 @@ int APInt::tcMultiply(WordType *dst, const WordType *lhs, return overflow; } -/* DST = LHS * RHS, where DST has width the sum of the widths of the operands. No overflow occurs. DST must be disjoint from both operands. Returns the number of parts required to hold the result. */ -unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs, - const WordType *rhs, unsigned lhsParts, - unsigned rhsParts) { +/// DST = LHS * RHS, where DST has width the sum of the widths of the +/// operands. No overflow occurs. DST must be disjoint from both operands. +void APInt::tcFullMultiply(WordType *dst, const WordType *lhs, + const WordType *rhs, unsigned lhsParts, + unsigned rhsParts) { /* Put the narrower number on the LHS for less loops below. */ if (lhsParts > rhsParts) return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts); @@ -2363,10 +2353,6 @@ unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs, for (unsigned i = 0; i < lhsParts; i++) tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true); - - unsigned n = lhsParts + rhsParts; - - return n - (dst[n - 1] == 0); } /* If RHS is zero LHS and REMAINDER are left unchanged, return one. @@ -2400,22 +2386,20 @@ int APInt::tcDivide(WordType *lhs, const WordType *rhs, /* Loop, subtracting SRHS if REMAINDER is greater and adding that to the total.
*/ for (;;) { - int compare; - - compare = tcCompare(remainder, srhs, parts); - if (compare >= 0) { - tcSubtract(remainder, srhs, 0, parts); - lhs[n] |= mask; - } + int compare = tcCompare(remainder, srhs, parts); + if (compare >= 0) { + tcSubtract(remainder, srhs, 0, parts); + lhs[n] |= mask; + } - if (shiftCount == 0) - break; - shiftCount--; - tcShiftRight(srhs, parts, 1); - if ((mask >>= 1) == 0) { - mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1); - n--; - } + if (shiftCount == 0) + break; + shiftCount--; + tcShiftRight(srhs, parts, 1); + if ((mask >>= 1) == 0) { + mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1); + n--; + } } return false; diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 63c440037c22..83376284548f 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -81,6 +81,7 @@ add_llvm_library(LLVMSupport MD5.cpp NativeFormatting.cpp Options.cpp + Parallel.cpp PluginLoader.cpp PrettyStackTrace.cpp RandomNumberGenerator.cpp diff --git a/lib/Support/Parallel.cpp b/lib/Support/Parallel.cpp new file mode 100644 index 000000000000..ab2cfdebf07d --- /dev/null +++ b/lib/Support/Parallel.cpp @@ -0,0 +1,138 @@ +//===- llvm/Support/Parallel.cpp - Parallel algorithms --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Parallel.h" +#include "llvm/Config/llvm-config.h" + +#include <atomic> +#include <stack> +#include <thread> + +using namespace llvm; + +namespace { + +/// \brief An abstract class that takes closures and runs them asynchronously. +class Executor { +public: + virtual ~Executor() = default; + virtual void add(std::function<void()> func) = 0; + + static Executor *getDefaultExecutor(); +}; + +#if !LLVM_ENABLE_THREADS +class SyncExecutor : public Executor { +public: + virtual void add(std::function<void()> F) { F(); } +}; + +Executor *Executor::getDefaultExecutor() { + static SyncExecutor Exec; + return &Exec; +} + +#elif defined(_MSC_VER) +/// \brief An Executor that runs tasks via ConcRT. +class ConcRTExecutor : public Executor { + struct Taskish { + Taskish(std::function<void()> Task) : Task(Task) {} + + std::function<void()> Task; + + static void run(void *P) { + Taskish *Self = static_cast<Taskish *>(P); + Self->Task(); + concurrency::Free(Self); + } + }; + +public: + virtual void add(std::function<void()> F) { + Concurrency::CurrentScheduler::ScheduleTask( + Taskish::run, new (concurrency::Alloc(sizeof(Taskish))) Taskish(F)); + } +}; + +Executor *Executor::getDefaultExecutor() { + static ConcRTExecutor exec; + return &exec; +} + +#else +/// \brief An implementation of an Executor that runs closures on a thread pool +/// in filo order. +class ThreadPoolExecutor : public Executor { +public: + explicit ThreadPoolExecutor( + unsigned ThreadCount = std::thread::hardware_concurrency()) + : Done(ThreadCount) { + // Spawn all but one of the threads in another thread as spawning threads + // can take a while. + std::thread([&, ThreadCount] { + for (size_t i = 1; i < ThreadCount; ++i) { + std::thread([=] { work(); }).detach(); + } + work(); + }).detach(); + } + + ~ThreadPoolExecutor() override { + std::unique_lock<std::mutex> Lock(Mutex); + Stop = true; + Lock.unlock(); + Cond.notify_all(); + // Wait for ~Latch.
+ } + + void add(std::function<void()> F) override { + std::unique_lock<std::mutex> Lock(Mutex); + WorkStack.push(F); + Lock.unlock(); + Cond.notify_one(); + } + +private: + void work() { + while (true) { + std::unique_lock<std::mutex> Lock(Mutex); + Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); }); + if (Stop) + break; + auto Task = WorkStack.top(); + WorkStack.pop(); + Lock.unlock(); + Task(); + } + Done.dec(); + } + + std::atomic<bool> Stop{false}; + std::stack<std::function<void()>> WorkStack; + std::mutex Mutex; + std::condition_variable Cond; + parallel::detail::Latch Done; +}; + +Executor *Executor::getDefaultExecutor() { + static ThreadPoolExecutor exec; + return &exec; +} +#endif +} + +#if LLVM_ENABLE_THREADS +void parallel::detail::TaskGroup::spawn(std::function<void()> F) { + L.inc(); + Executor::getDefaultExecutor()->add([&, F] { + F(); + L.dec(); + }); +} +#endif diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index fa28ba1b6ab6..cdea09be41e0 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -103,13 +103,16 @@ #define STATVFS_F_FLAG(vfs) (vfs).f_flags #endif +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include <sys/sysctl.h> +#endif + using namespace llvm; namespace llvm { namespace sys { namespace fs { -#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \ - defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \ +#if defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \ defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) || \ defined(_AIX) static int @@ -164,7 +167,7 @@ getprogpath(char ret[PATH_MAX], const char *bin) free(pv); return nullptr; } -#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__ +#endif // Bitrig || OpenBSD || minix || linux || CYGWIN || DragonFly || AIX /// GetMainExecutable - Return the path to the main executable, given the /// value of argv[0] from program startup. @@ -180,9 +183,24 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { if (realpath(exe_path, link_path)) return link_path; } -#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \ - defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \ - defined(__FreeBSD_kernel__) || defined(_AIX) +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) + int mib[4]; + mib[0] = CTL_KERN; +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PATHNAME; + mib[3] = -1; +#else + mib[1] = KERN_PROC_ARGS; + mib[2] = -1; + mib[3] = KERN_PROC_PATHNAME; +#endif + char exe_path[PATH_MAX]; + size_t cb = sizeof(exe_path); + if (sysctl(mib, 4, exe_path, &cb, NULL, 0) == 0) + return exe_path; +#elif defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \ + defined(__DragonFly__) || defined(_AIX) char exe_path[PATH_MAX]; if (getprogpath(exe_path, argv0) != NULL) diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 16f8f5a98e52..1d0143c6716e 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -347,7 +347,7 @@ static bool terminalHasColors(int fd) { MutexGuard G(*TermColorMutex); int errret = 0; - if (setupterm((char *)nullptr, fd, &errret) != 0) + if (setupterm(nullptr, fd, &errret) != 0) // Regardless of why, if we can't get terminfo, we shouldn't try to print // colors. return false; @@ -369,7 +369,7 @@ static bool terminalHasColors(int fd) { // Now extract the structure allocated by setupterm and free its memory // through a really silly dance.
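Callers never touch these executors directly: work enters through parallel::detail::TaskGroup::spawn above, which ties every task to the group's Latch so the group can be awaited. A minimal usage sketch, assuming the TaskGroup and Latch interfaces declared in llvm/Support/Parallel.h, which this file implements:

#include "llvm/Support/Parallel.h"
#include <atomic>

int main() {
  std::atomic<int> Sum{0};
  {
    llvm::parallel::detail::TaskGroup TG;
    for (int I = 1; I <= 100; ++I)
      TG.spawn([&Sum, I] { Sum += I; }); // queued onto the shared pool
  } // ~TaskGroup waits on the Latch: all spawned tasks are done here
  return Sum == 5050 ? 0 : 1;
}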
- struct term *termp = set_curterm((struct term *)nullptr); + struct term *termp = set_curterm(nullptr); (void)del_curterm(termp); // Drop any errors here. // Return true if we found a color capabilities for the current terminal. diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 73f2b6a25f66..4af5fef4287c 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -216,6 +216,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", FeatureCRC, FeatureCrypto, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon ]>; diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp index ff3e4c40e2c2..29f6d571d6bd 100644 --- a/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/lib/Target/AArch64/AArch64CallLowering.cpp @@ -380,7 +380,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets); } - CallSeqStart.addImm(Handler.StackSize); + CallSeqStart.addImm(Handler.StackSize).addImm(0); MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) .addImm(Handler.StackSize) .addImm(0); diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 083708001757..9ac7ecb9cdb4 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -3014,7 +3014,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Process the args. for (CCValAssign &VA : ArgLocs) { diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4b1bb27dce73..4f7c2e122390 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2265,7 +2265,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) @@ -3249,9 +3249,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, - true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index cb268828455e..c42738da7ab0 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3427,6 +3427,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1); + Found = true; + } break; case AArch64::FSUBDrr: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { @@ -3441,6 +3445,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1); + Found = true; + } break; case AArch64::FSUBv2f32: if (canCombineWithFMUL(MBB, Root.getOperand(2), @@ -3495,6 +3503,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { case MachineCombinerPattern::FMULADDD_OP2: case MachineCombinerPattern::FMULSUBD_OP1: case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP2: case MachineCombinerPattern::FMLAv1i64_indexed_OP1: @@ -3996,6 +4006,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; } + + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: { + // FNMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMADD R,A,B,C // = -A*B - C + // --- Create(FNMADD); + if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) { + Opc = AArch64::FNMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: case MachineCombinerPattern::FMULSUBD_OP2: { // FMUL I=A,B,0 @@ -4011,6 +4039,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; + } case MachineCombinerPattern::FMLSv1i32_indexed_OP2: Opc = AArch64::FMLSv1i32_indexed; @@ -4067,7 +4096,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Accumulator); } break; - } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 902b08844216..5ddf66654a67 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -156,7 +156,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + 
SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, [SDNPHasChain, SDNPOutGlue]>; def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDCallSeqEnd<[ SDTCisVT<0, i32>, @@ -328,8 +329,9 @@ include "AArch64InstrFormats.td" let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>, Sched<[]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_start timm:$amt1, timm:$amt2)]>, + Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 5f895903da6f..789270c2a34b 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -529,9 +529,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // for the greedy mode the cost of the cross bank copy will // offset this number. // FIXME: Should be derived from the scheduling model. - if (OpRegBankIdx[0] >= PMI_FirstFPR) + if (OpRegBankIdx[0] != PMI_FirstGPR) Cost = 2; + else + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. + for (const MachineInstr &UseMI : + MRI.use_instructions(MI.getOperand(0).getReg())) + // If we have at least one direct use in a FP instruction, + // assume this was a floating point load in the IR. + // If it was not, we would have had a bitcast before + // reaching that instruction. + if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) { + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } break; + case TargetOpcode::G_STORE: + // Check if that store is fed by fp instructions. + if (OpRegBankIdx[0] == PMI_FirstGPR) { + unsigned VReg = MI.getOperand(0).getReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode())) + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } } // Finally construct the computed mapping. 
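The G_LOAD and G_STORE mapping changes above compensate for generic MIR carrying no FP/integer distinction on memory operations: a load whose direct users are floating-point instructions (or a store fed by one) is assigned to FPR so the value is not first materialized in a GPR and copied across banks. In source terms this is the shape being targeted (illustrative):

// Both loads feed only an fadd, so mapping them straight to FPR avoids
// two GPR-to-FPR copies; without the use scan they would default to GPR.
float sumPair(const float *P) { return P[0] + P[1]; }

// Store side: the fmul result is produced in FPR and can be stored directly.
void scaleInPlace(float *P, float F) { *P *= F; }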
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 8f8eeef8a6cf..a9b4d44a523e 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -42,11 +42,11 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1 def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>; @@ -62,9 +62,9 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4 def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; @@ -72,13 +72,14 @@ def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i1 def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex 
"^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>; // SIMD Integer Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>; @@ -119,10 +120,10 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64) def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; @@ -169,9 +170,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; @@ -185,8 +186,9 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>; + // SIMD Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; @@ -294,9 +296,9 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, 
FRSQRTSv2f32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; @@ -311,9 +313,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; -def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; @@ -416,22 +418,25 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>; def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>; // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>; +// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0 +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>; @@ -475,16 +480,17 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; // Divide and Multiply Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; -def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, 
ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>; -def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>; -def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>; def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>; // Move and Shift Instructions // ----------------------------------------------------------------------------- diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td index e64b2c441a19..6526cc28e806 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td @@ -29,8 +29,9 @@ // Define 1 micro-op types def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } -def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } @@ -45,8 +46,10 @@ def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } +def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } @@ -75,14 +78,26 @@ def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 2; } +def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 5; let NumMicroOps = 2; } +def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 6; let NumMicroOps = 2; } +def FalkorWr_FMUL64_2VXVY_6cyc : 
SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { let Latency = 4; @@ -350,18 +365,17 @@ def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, let NumMicroOps = 9; } -// Forwarding logic is modeled for vector multiply and accumulate +// Forwarding logic is modeled for multiply add/accumulate. // ----------------------------------------------------------------------------- -def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc, - FalkorWr_2VXVY_4cyc]>; -def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc, - FalkorWr_1VXVY_6cyc, - FalkorWr_2VXVY_5cyc, - FalkorWr_2VXVY_6cyc]>; +def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; +def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; +def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; +def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; +def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; // SchedPredicates and WriteVariants for Immediate Zero and LSLFast // ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>; +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; def FalkorWr_FMOV : SchedWriteVariant<[ @@ -378,7 +392,6 @@ def FalkorWr_LDR : SchedWriteVariant<[ def FalkorWr_ADD : SchedWriteVariant<[ SchedVar, - SchedVar, SchedVar]>; def FalkorWr_PRFM : SchedWriteVariant<[ diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index abdeac019a18..1c81d34014fd 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -91,6 +91,8 @@ void AArch64Subtarget::initializeProperties() { case Falkor: MaxInterleaveFactor = 4; VectorInsertExtractBaseCost = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case Kryo: MaxInterleaveFactor = 4; @@ -99,6 +101,8 @@ void AArch64Subtarget::initializeProperties() { PrefetchDistance = 740; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX2T99: CacheLineSize = 64; @@ -108,6 +112,8 @@ void AArch64Subtarget::initializeProperties() { PrefetchDistance = 128; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX: case ThunderXT88: @@ -116,6 +122,8 @@ void AArch64Subtarget::initializeProperties() { CacheLineSize = 128; PrefFunctionAlignment = 3; PrefLoopAlignment = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. 
+ MinVectorRegisterBitWidth = 128; break; case CortexA35: break; case CortexA53: break; diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 5b9bee6e41b8..df54bf3f48e1 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -83,6 +83,9 @@ protected: // NegativeImmediates - transform instructions with negative immediates bool NegativeImmediates = true; + // Enable 64-bit vectorization in SLP. + unsigned MinVectorRegisterBitWidth = 64; + bool UseAA = false; bool PredictableSelectIsExpensive = false; bool BalanceFPOps = false; @@ -106,6 +109,7 @@ protected: unsigned PrefFunctionAlignment = 0; unsigned PrefLoopAlignment = 0; unsigned MaxJumpTableSize = 0; + unsigned WideningBaseCost = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -190,6 +194,10 @@ public: bool isXRaySupported() const override { return true; } + unsigned getMinVectorRegisterBitWidth() const { + return MinVectorRegisterBitWidth; + } + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } @@ -228,6 +236,8 @@ public: unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } + unsigned getWideningBaseCost() const { return WideningBaseCost; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 8875f9b72647..12a2e9a867f0 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } + +void AArch64_MachoTargetObjectFile::getNameWithPrefix( + SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const { + // AArch64 does not use section-relative relocations so any global symbol must + // be accessed via at least a linker-private symbol. + getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 05e1dfa9e6c9..47e3bce43f6e 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -40,6 +40,9 @@ public: const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; + + void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const override; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4d59da0c646d..7c6f55c06bce 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } +bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, + ArrayRef<const Value *> Args) { + + // A helper that returns a vector type from the given type. The number of + // elements in type Ty determine the vector width.
+ auto toVectorTy = [&](Type *ArgTy) { + return VectorType::get(ArgTy->getScalarType(), + DstTy->getVectorNumElements()); + }; + + // Exit early if DstTy is not a vector type whose elements are at least + // 16-bits wide. + if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) + return false; + + // Determine if the operation has a widening variant. We consider both the + // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the + // instructions. + // + // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we + // verify that their extending operands are eliminated during code + // generation. + switch (Opcode) { + case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). + case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + break; + default: + return false; + } + + // To be a widening instruction (either the "wide" or "long" versions), the + // second operand must be a sign- or zero extend having a single user. We + // only consider extends having a single user because they may otherwise not + // be eliminated. + if (Args.size() != 2 || + (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || + !Args[1]->hasOneUse()) + return false; + auto *Extend = cast<CastInst>(Args[1]); + + // Legalize the destination type and ensure it can be used in a widening + // operation. + auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); + unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); + if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) + return false; + + // Legalize the source type and ensure it can be used in a widening + // operation. + Type *SrcTy = toVectorTy(Extend->getSrcTy()); + auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); + unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); + if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) + return false; + + // Get the total number of vector elements in the legalized types. + unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements(); + unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements(); + + // Return true if the legalized types have the same number of vector elements + // and the destination element type size is twice that of the source type. + return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; +} + int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // If the cast is observable, and it is used by a widening instruction (e.g., + // uaddl, saddw, etc.), it may be free. + if (I && I->hasOneUse()) { + auto *SingleUser = cast<Instruction>(*I->user_begin()); + SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); + if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { + // If the cast is the second operand, it is free. We will generate either + // a "wide" or "long" version of the widening instruction. + if (I == SingleUser->getOperand(1)) + return 0; + // If the cast is not the second operand, it will be free if it looks the + // same as the second operand. In this case, we will generate a "long" + // version of the widening instruction. + if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) + if (I->getOpcode() == Cast->getOpcode() && + cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) + return 0; + } + } + EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCost( // Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), + // add in the widening overhead specified by the sub-target. Since the + // extends feeding widening instructions are performed automatically, they + // aren't present in the generated code and have a zero cost. By adding a + // widening overhead here, we attach the total cost of the combined operation + // to the widening instruction. + int Cost = 0; + if (isWideningInstruction(Ty, Opcode, Args)) + Cost += ST->getWideningBaseCost(); + int ISD = TLI->InstructionOpcodeToISD(Opcode); if (ISD == ISD::SDIV && @@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. - int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -405,8 +499,8 @@ int AArch64TTIImpl::getArithmeticInstrCost( switch (ISD) { default: - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: case ISD::OR: case ISD::AND: // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering.
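The widening bookkeeping above rewards IR in which the extend feeding the second operand folds into the arithmetic instruction itself. Roughly the source shape being modeled (illustrative): each 8-bit element of B is zero-extended and added to a 16-bit lane of A, which AArch64 emits as a single uaddw per vector, so the explicit extend never appears and getCastInstrCost() can price it at zero.

#include <cstdint>

// After vectorization this is add <8 x i16> %a, (zext <8 x i8> %b), i.e.
// uaddw v0.8h, v0.8h, v1.8b; the extend is folded into the instruction.
void addWiden(uint16_t *A, const uint8_t *B, int N) {
  for (int I = 0; I < N; ++I)
    A[I] = uint16_t(A[I] + B[I]);
}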
- return 1 * LT.first; + return (Cost + 1) * LT.first; } } diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index e37c003e064c..280d97f3c502 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIImplBase { VECTOR_LDST_FOUR_ELEMENTS }; + bool isWideningInstruction(Type *Ty, unsigned Opcode, + ArrayRef Args); + public: explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -84,6 +87,10 @@ public: return 64; } + unsigned getMinVectorRegisterBitWidth() { + return ST->getMinVectorRegisterBitWidth(); + } + unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, @@ -134,6 +141,10 @@ public: unsigned getMinPrefetchStride(); unsigned getMaxPrefetchIterationsAhead(); + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return false; + } /// @} }; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 4dbcc9581a84..449d732a8d44 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3904,10 +3904,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { return false; } +static SMLoc incrementLoc(SMLoc L, int Offset) { + return SMLoc::getFromPointer(L.getPointer() + Offset); +} + /// parseDirectiveCPU /// ::= .cpu id bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { - SMLoc CPULoc = getLoc(); + SMLoc CurLoc = getLoc(); StringRef CPU, ExtensionString; std::tie(CPU, ExtensionString) = @@ -3923,15 +3927,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { // FIXME This is using tablegen data, but should be moved to ARMTargetParser // once that is tablegen'ed if (!getSTI().isCPUStringValid(CPU)) { - Error(CPULoc, "unknown CPU name"); + Error(CurLoc, "unknown CPU name"); return false; } MCSubtargetInfo &STI = copySTI(); STI.setDefaultFeatures(CPU, ""); + CurLoc = incrementLoc(CurLoc, CPU.size()); FeatureBitset Features = STI.getFeatureBits(); for (auto Name : RequestedExtensions) { + // Advance source location past '+'. + CurLoc = incrementLoc(CurLoc, 1); + bool EnableFeature = true; if (Name.startswith_lower("no")) { @@ -3939,6 +3947,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { Name = Name.substr(2); } + bool FoundExtension = false; for (const auto &Extension : ExtensionMap) { if (Extension.Name != Name) continue; @@ -3952,9 +3961,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { uint64_t Features = ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); setAvailableFeatures(Features); + FoundExtension = true; break; } + + if (!FoundExtension) + Error(CurLoc, "unsupported architectural extension"); + + CurLoc = incrementLoc(CurLoc, Name.size()); } return false; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 94112849f84e..1b28df963b40 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -32,8 +32,9 @@ static cl::opt AsmWriterVariant( clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. 
- AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the short, Apple-specific + // form when targeting Darwin. + AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant; PrivateGlobalPrefix = "L"; PrivateLabelPrefix = "L"; @@ -68,8 +69,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { if (T.getArch() == Triple::aarch64_be) IsLittleEndian = false; - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the generic form when + // targeting ELF. + AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant; CodePointerSize = 8; diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 8f6e1e7d8846..3f89702bed50 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -50,6 +50,10 @@ FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); +FunctionPass *createAMDGPUMachineCFGStructurizerPass(); + +void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); +extern char &AMDGPUMachineCFGStructurizerID; ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 2e5b78bbf7ef..b279bd61e180 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -61,6 +61,24 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "Support flat address space" >; +def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", + "FlatInstOffsets", + "true", + "Flat instructions have immediate offset addressing mode" +>; + +def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", + "FlatGlobalInsts", + "true", + "Have global_* flat memory instructions" +>; + +def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", + "FlatScratchInsts", + "true", + "Have scratch_* flat memory instructions" +>; + def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "UnalignedBufferAccess", "true", @@ -407,7 +425,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP + FeatureFastFMAF32, FeatureDPP, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ccae36ced1f8..7c99752b881f 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -136,8 +136,7 @@ private: bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, SDValue &ImmOffset, SDValue &VOffset) const; - bool SelectFlat(SDValue Addr, SDValue &VAddr, - SDValue &SLC, SDValue &TFE) const; + bool SelectFlat(SDValue Addr, SDValue &VAddr, SDValue &SLC) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; @@ -1278,10 +1277,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, bool AMDGPUDAGToDAGISel::SelectFlat(SDValue 
Addr, SDValue &VAddr, - SDValue &SLC, - SDValue &TFE) const { + SDValue &SLC) const { VAddr = Addr; - TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 915d1d9e0e68..f80652b87373 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) { case AMDGPUISD::INTERP_P1: case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: + + // TODO: Should really be looking at the users of the bitcast. These are + // problematic because bitcasts are used to legalize all stores to integer + // types. + case ISD::BITCAST: return false; default: return true; } } -static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) { +bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold) { // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus // it is truly free to use a source modifier in all cases. If there are // multiple users but for each one will necessitate using VOP3, there will be @@ -2299,7 +2305,7 @@ static bool isU24(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); DAG.computeKnownBits(Op, Known); - return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24; + return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; } static bool isI24(SDValue Op, SelectionDAG &DAG) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index e1a5a2072418..4c588a7bafd0 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -132,6 +132,8 @@ public: return false; } + static bool allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; bool isTruncateFree(EVT Src, EVT Dest) const override; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8867ed689a31..a7eac080f885 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -127,9 +127,9 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { .add(I.getOperand(1)) .add(I.getOperand(0)) .addImm(0) - .addImm(0) .addImm(0); + // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. 
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); @@ -393,7 +393,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { .add(I.getOperand(0)) .addReg(PtrReg) .addImm(0) - .addImm(0) .addImm(0); bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index a2567a549028..9de302994e68 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -33,6 +33,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); + setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); setAction({G_GEP, P1}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp new file mode 100644 index 000000000000..6d2785ba1c60 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -0,0 +1,2881 @@ +//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the machine instruction level CFG structurizer pass. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegionInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <tuple> +using namespace llvm; + +#define DEBUG_TYPE "amdgpucfgstructurizer" + +namespace { +class PHILinearizeDestIterator; + +class PHILinearize { + friend class PHILinearizeDestIterator; + +public: + typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT; + +private: + typedef DenseSet<PHISourceT> PHISourcesT; + typedef struct { + unsigned DestReg; + DebugLoc DL; + PHISourcesT Sources; + } PHIInfoElementT; + typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT; + PHIInfoT PHIInfo; + + static unsigned phiInfoElementGetDest(PHIInfoElementT *Info); + static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef); + static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info); + static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + static void phiInfoElementRemoveSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB); + PHIInfoElementT *findPHIInfoElement(unsigned DestReg); + PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg, + MachineBasicBlock *SourceMBB); + +public: + bool findSourcesFromMBB(MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 4> &Sources); + void addDest(unsigned DestReg, const DebugLoc &DL); + void replaceDef(unsigned OldDestReg, unsigned
NewDestReg); + void deleteDef(unsigned DestReg); + void addSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + void removeSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB = nullptr); + bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, + unsigned &DestReg); + bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr); + unsigned getNumSources(unsigned DestReg); + void dump(MachineRegisterInfo *MRI); + void clear(); + + typedef PHISourcesT::iterator source_iterator; + typedef PHILinearizeDestIterator dest_iterator; + + dest_iterator dests_begin(); + dest_iterator dests_end(); + + source_iterator sources_begin(unsigned Reg); + source_iterator sources_end(unsigned Reg); +}; + +class PHILinearizeDestIterator { +private: + PHILinearize::PHIInfoT::iterator Iter; + +public: + unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); } + PHILinearizeDestIterator &operator++() { + ++Iter; + return *this; + } + bool operator==(const PHILinearizeDestIterator &I) const { + return I.Iter == Iter; + } + bool operator!=(const PHILinearizeDestIterator &I) const { + return I.Iter != Iter; + } + + PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {} +}; + +unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) { + return Info->DestReg; +} + +void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info, + unsigned NewDef) { + Info->DestReg = NewDef; +} + +PHILinearize::PHISourcesT & +PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) { + return Info->Sources; +} + +void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + // Assertion ensures we don't use the same SourceMBB for the + // sources, because we cannot have different registers with + // identical predecessors, but we can have the same register for + // multiple predecessors. 
+#if !defined(NDEBUG)
+  for (auto SI : phiInfoElementGetSources(Info)) {
+    assert((SI.second != SourceMBB || SourceReg == SI.first));
+  }
+#endif
+
+  phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB));
+}
+
+void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info,
+                                              unsigned SourceReg,
+                                              MachineBasicBlock *SourceMBB) {
+  auto &Sources = phiInfoElementGetSources(Info);
+  SmallVector<PHISourceT, 4> EliminatedSources;
+  for (auto SI : Sources) {
+    if (SI.first == SourceReg &&
+        (SI.second == nullptr || SI.second == SourceMBB)) {
+      EliminatedSources.push_back(PHISourceT(SI.first, SI.second));
+    }
+  }
+
+  for (auto &Source : EliminatedSources) {
+    Sources.erase(Source);
+  }
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElement(unsigned DestReg) {
+  for (auto I : PHIInfo) {
+    if (phiInfoElementGetDest(I) == DestReg) {
+      return I;
+    }
+  }
+  return nullptr;
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg,
+                                           MachineBasicBlock *SourceMBB) {
+  for (auto I : PHIInfo) {
+    for (auto SI : phiInfoElementGetSources(I)) {
+      if (SI.first == SourceReg &&
+          (SI.second == nullptr || SI.second == SourceMBB)) {
+        return I;
+      }
+    }
+  }
+  return nullptr;
+}
+
+bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+                                      SmallVector<unsigned, 4> &Sources) {
+  bool FoundSource = false;
+  for (auto I : PHIInfo) {
+    for (auto SI : phiInfoElementGetSources(I)) {
+      if (SI.second == SourceMBB) {
+        FoundSource = true;
+        Sources.push_back(SI.first);
+      }
+    }
+  }
+  return FoundSource;
+}
+
+void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) {
+  assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists");
+  PHISourcesT EmptySet;
+  PHIInfoElementT *NewElement = new PHIInfoElementT();
+  NewElement->DestReg = DestReg;
+  NewElement->DL = DL;
+  NewElement->Sources = EmptySet;
+  PHIInfo.insert(NewElement);
+}
+
+void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) {
+  phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg);
+}
+
+void PHILinearize::deleteDef(unsigned DestReg) {
+  PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg);
+  PHIInfo.erase(InfoElement);
+  delete InfoElement;
+}
+
+void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg,
+                             MachineBasicBlock *SourceMBB) {
+  phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg,
+                                MachineBasicBlock *SourceMBB) {
+  phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+                            unsigned &DestReg) {
+  PHIInfoElementT *InfoElement =
+      findPHIInfoElementFromSource(SourceReg, SourceMBB);
+  if (InfoElement != nullptr) {
+    DestReg = phiInfoElementGetDest(InfoElement);
+    return true;
+  }
+  return false;
+}
+
+bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) {
+  unsigned DestReg;
+  return findDest(Reg, SourceMBB, DestReg);
+}
+
+unsigned PHILinearize::getNumSources(unsigned DestReg) {
+  return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size();
+}
+
+void PHILinearize::dump(MachineRegisterInfo *MRI) {
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+  dbgs() << "=PHIInfo Start=\n";
+  for (auto PII : this->PHIInfo) {
+    PHIInfoElementT &Element = *PII;
+    dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+           << " Sources: {";
+    for (auto &SI : Element.Sources) {
+      dbgs() << PrintReg(SI.first,
TRI) << "(BB#" + << SI.second->getNumber() << "),"; + } + dbgs() << "}\n"; + } + dbgs() << "=PHIInfo End=\n"; +} + +void PHILinearize::clear() { PHIInfo = PHIInfoT(); } + +PHILinearize::dest_iterator PHILinearize::dests_begin() { + return PHILinearizeDestIterator(PHIInfo.begin()); +} + +PHILinearize::dest_iterator PHILinearize::dests_end() { + return PHILinearizeDestIterator(PHIInfo.end()); +} + +PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).begin(); +} +PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).end(); +} + +class RegionMRT; +class MBBMRT; + +static unsigned getPHINumInputs(MachineInstr &PHI) { + assert(PHI.isPHI()); + return (PHI.getNumOperands() - 1) / 2; +} + +static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 2).getMBB(); +} + +static void setPhiPred(MachineInstr &PHI, unsigned Index, + MachineBasicBlock *NewPred) { + PHI.getOperand(Index * 2 + 2).setMBB(NewPred); +} + +static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 1).getReg(); +} + +static unsigned getPHIDestReg(MachineInstr &PHI) { + assert(PHI.isPHI()); + return PHI.getOperand(0).getReg(); +} + +class LinearizedRegion { +protected: + MachineBasicBlock *Entry; + // The exit block is part of the region, and is the last + // merge block before exiting the region. + MachineBasicBlock *Exit; + DenseSet LiveOuts; + SmallPtrSet MBBs; + bool HasLoop; + LinearizedRegion *Parent; + RegionMRT *RMRT; + + void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo); + + void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion); + + void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion = nullptr); + +public: + void setRegionMRT(RegionMRT *Region) { RMRT = Region; } + + RegionMRT *getRegionMRT() { return RMRT; } + + void setParent(LinearizedRegion *P) { Parent = P; } + + LinearizedRegion *getParent() { return Parent; } + + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr); + + void setBBSelectRegIn(unsigned Reg); + + unsigned getBBSelectRegIn(); + + void setBBSelectRegOut(unsigned Reg, bool IsLiveOut); + + unsigned getBBSelectRegOut(); + + void setHasLoop(bool Value); + + bool getHasLoop(); + + void addLiveOut(unsigned VReg); + + void removeLiveOut(unsigned Reg); + + void replaceLiveOut(unsigned OldReg, unsigned NewReg); + + void replaceRegister(unsigned Register, unsigned NewRegister, + MachineRegisterInfo *MRI, bool ReplaceInside, + bool ReplaceOutside, bool IncludeLoopPHIs); + + void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + void 
replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + DenseSet *getLiveOuts(); + + void setEntry(MachineBasicBlock *NewEntry); + + MachineBasicBlock *getEntry(); + + void setExit(MachineBasicBlock *NewExit); + + MachineBasicBlock *getExit(); + + void addMBB(MachineBasicBlock *MBB); + + void addMBBs(LinearizedRegion *InnerRegion); + + bool contains(MachineBasicBlock *MBB); + + bool isLiveOut(unsigned Reg); + + bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI); + + void removeFalseRegisterKills(MachineRegisterInfo *MRI); + + void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(); + + ~LinearizedRegion(); +}; + +class MRT { +protected: + RegionMRT *Parent; + unsigned BBSelectRegIn; + unsigned BBSelectRegOut; + +public: + unsigned getBBSelectRegIn() { return BBSelectRegIn; } + + unsigned getBBSelectRegOut() { return BBSelectRegOut; } + + void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; } + + void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; } + + virtual RegionMRT *getRegionMRT() { return nullptr; } + + virtual MBBMRT *getMBBMRT() { return nullptr; } + + bool isRegion() { return getRegionMRT() != nullptr; } + + bool isMBB() { return getMBBMRT() != nullptr; } + + bool isRoot() { return Parent == nullptr; } + + void setParent(RegionMRT *Region) { Parent = Region; } + + RegionMRT *getParent() { return Parent; } + + static MachineBasicBlock * + initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap &RegionMap); + + static RegionMRT *buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, + MachineRegisterInfo *MRI); + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0; + + void dumpDepth(int depth) { + for (int i = depth; i > 0; --i) { + dbgs() << " "; + } + } + + virtual ~MRT() {} +}; + +class MBBMRT : public MRT { + MachineBasicBlock *MBB; + +public: + virtual MBBMRT *getMBBMRT() { return this; } + + MachineBasicBlock *getMBB() { return MBB; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "MBB: " << getMBB()->getNumber(); + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; + } + + MBBMRT(MachineBasicBlock *BB) : MBB(BB) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } +}; + +class RegionMRT : public MRT { +protected: + MachineRegion *Region; + LinearizedRegion *LRegion; + MachineBasicBlock *Succ; + + SetVector Children; + +public: + virtual RegionMRT *getRegionMRT() { return this; } + + void setLinearizedRegion(LinearizedRegion *LinearizeRegion) { + LRegion = LinearizeRegion; + } + + LinearizedRegion *getLinearizedRegion() { return LRegion; } + + MachineRegion *getMachineRegion() { return Region; } + + unsigned getInnerOutputRegister() { + return (*(Children.begin()))->getBBSelectRegOut(); + } + + void addChild(MRT *Tree) { Children.insert(Tree); } + + SetVector *getChildren() { return &Children; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "Region: " << (void *)Region; + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), 
TRI) << "\n"; + + dumpDepth(depth); + if (getSucc()) + dbgs() << "Succ: " << getSucc()->getNumber() << "\n"; + else + dbgs() << "Succ: none \n"; + for (auto MRTI : Children) { + MRTI->dump(TRI, depth + 1); + } + } + + MRT *getEntryTree() { return Children.back(); } + + MRT *getExitTree() { return Children.front(); } + + MachineBasicBlock *getEntry() { + MRT *Tree = Children.back(); + return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry() + : Tree->getMBBMRT()->getMBB(); + } + + MachineBasicBlock *getExit() { + MRT *Tree = Children.front(); + return (Tree->isRegion()) ? Tree->getRegionMRT()->getExit() + : Tree->getMBBMRT()->getMBB(); + } + + void setSucc(MachineBasicBlock *MBB) { Succ = MBB; } + + MachineBasicBlock *getSucc() { return Succ; } + + bool contains(MachineBasicBlock *MBB) { + for (auto CI : Children) { + if (CI->isMBB()) { + if (MBB == CI->getMBBMRT()->getMBB()) { + return true; + } + } else { + if (CI->getRegionMRT()->contains(MBB)) { + return true; + } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr && + CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) { + return true; + } + } + } + return false; + } + + void replaceLiveOutReg(unsigned Register, unsigned NewRegister) { + LinearizedRegion *LRegion = getLinearizedRegion(); + LRegion->replaceLiveOut(Register, NewRegister); + for (auto &CI : Children) { + if (CI->isRegion()) { + CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister); + } + } + } + + RegionMRT(MachineRegion *MachineRegion) + : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } + + virtual ~RegionMRT() { + if (LRegion) { + delete LRegion; + } + + for (auto CI : Children) { + delete &(*CI); + } + } +}; + +static unsigned createBBSelectReg(const SIInstrInfo *TII, + MachineRegisterInfo *MRI) { + return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32)); +} + +MachineBasicBlock * +MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap &RegionMap) { + for (auto &MFI : MF) { + MachineBasicBlock *ExitMBB = &MFI; + if (ExitMBB->succ_size() == 0) { + return ExitMBB; + } + } + llvm_unreachable("CFG has no exit block"); + return nullptr; +} + +RegionMRT *MRT::buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, MachineRegisterInfo *MRI) { + SmallPtrSet PlacedRegions; + DenseMap RegionMap; + MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion(); + RegionMRT *Result = new RegionMRT(TopLevelRegion); + RegionMap[TopLevelRegion] = Result; + + // Insert the exit block first, we need it to be the merge node + // for the top level region. 
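+  // getEntryTree()/getExitTree() rely on this insertion order: the SetVector
+  // keeps children in the order they are added, so Children.front() becomes
+  // the exit subtree and Children.back() (the post-order-last entry block)
+  // becomes the entry subtree.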
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap); + + unsigned BBSelectRegIn = createBBSelectReg(TII, MRI); + MBBMRT *ExitMRT = new MBBMRT(Exit); + RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT); + ExitMRT->setBBSelectRegIn(BBSelectRegIn); + + for (auto MBBI : post_order(&(MF.front()))) { + MachineBasicBlock *MBB = &(*MBBI); + + // Skip Exit since we already added it + if (MBB == Exit) { + continue; + } + + DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n"); + MBBMRT *NewMBB = new MBBMRT(MBB); + MachineRegion *Region = RegionInfo->getRegionFor(MBB); + + // Ensure we have the MRT region + if (RegionMap.count(Region) == 0) { + RegionMRT *NewMRTRegion = new RegionMRT(Region); + RegionMap[Region] = NewMRTRegion; + + // Ensure all parents are in the RegionMap + MachineRegion *Parent = Region->getParent(); + while (RegionMap.count(Parent) == 0) { + RegionMRT *NewMRTParent = new RegionMRT(Parent); + NewMRTParent->addChild(NewMRTRegion); + NewMRTRegion->setParent(NewMRTParent); + RegionMap[Parent] = NewMRTParent; + NewMRTRegion = NewMRTParent; + Parent = Parent->getParent(); + } + RegionMap[Parent]->addChild(NewMRTRegion); + NewMRTRegion->setParent(RegionMap[Parent]); + } + + // Add MBB to Region MRT + RegionMap[Region]->addChild(NewMBB); + NewMBB->setParent(RegionMap[Region]); + RegionMap[Region]->setSucc(Region->getExit()); + } + return Result; +} + +void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + if (TRI->isVirtualRegister(Reg)) { + DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + // If this is a source register to a PHI we are chaining, it + // must be live out. 
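+    // For example, with %dst = PHI(%a, BB#1, %b, BB#2) being linearized,
+    // both %a and %b are PHIInfo sources and have to survive until the
+    // chained PHIs are rebuilt, even without other uses outside their block.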
+    if (PHIInfo.isSource(Reg)) {
+      DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n");
+      addLiveOut(Reg);
+    } else {
+      // If this is live out of the MBB
+      for (auto &UI : MRI->use_operands(Reg)) {
+        if (UI.getParent()->getParent() != MBB) {
+          DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber()
+                       << "): " << PrintReg(Reg, TRI) << "\n");
+          addLiveOut(Reg);
+        } else {
+          // If the use is in the same MBB we have to make sure
+          // it is after the def, otherwise it is live out in a loop
+          MachineInstr *UseInstr = UI.getParent();
+          for (MachineBasicBlock::instr_iterator
+                   MII = UseInstr->getIterator(),
+                   MIE = UseInstr->getParent()->instr_end();
+               MII != MIE; ++MII) {
+            if ((&(*MII)) == DefInstr) {
+              DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI)
+                           << "\n");
+              addLiveOut(Reg);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+                                             MachineInstr *DefInstr,
+                                             const MachineRegisterInfo *MRI,
+                                             const TargetRegisterInfo *TRI,
+                                             PHILinearize &PHIInfo) {
+  if (TRI->isVirtualRegister(Reg)) {
+    DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+    for (auto &UI : MRI->use_operands(Reg)) {
+      if (!Region->contains(UI.getParent()->getParent())) {
+        DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+                     << "): " << PrintReg(Reg, TRI) << "\n");
+        addLiveOut(Reg);
+      }
+    }
+  }
+}
+
+void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
+                                     const MachineRegisterInfo *MRI,
+                                     const TargetRegisterInfo *TRI,
+                                     PHILinearize &PHIInfo) {
+  DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n");
+  for (auto &II : *MBB) {
+    for (auto &RI : II.defs()) {
+      storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
+    }
+    for (auto &IRI : II.implicit_operands()) {
+      if (IRI.isDef()) {
+        storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo);
+      }
+    }
+  }
+
+  // If we have a successor with a PHI source coming from this MBB, we have
+  // to add the register as live out
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+                                        E = MBB->succ_end();
+       SI != E; ++SI) {
+    for (auto &II : *(*SI)) {
+      if (II.isPHI()) {
+        MachineInstr &PHI = II;
+        int numPreds = getPHINumInputs(PHI);
+        for (int i = 0; i < numPreds; ++i) {
+          if (getPHIPred(PHI, i) == MBB) {
+            unsigned PHIReg = getPHISourceReg(PHI, i);
+            DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber()
+                         << " -> BB#" << (*SI)->getNumber()
+                         << "): " << PrintReg(PHIReg, TRI) << "\n");
+            addLiveOut(PHIReg);
+          }
+        }
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "-Store Live Outs End-\n");
+}
+
+void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
+                                        const MachineRegisterInfo *MRI,
+                                        const TargetRegisterInfo *TRI,
+                                        PHILinearize &PHIInfo,
+                                        RegionMRT *TopRegion) {
+  for (auto &II : *MBB) {
+    for (auto &RI : II.defs()) {
+      storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI,
+                            PHIInfo);
+    }
+    for (auto &IRI : II.implicit_operands()) {
+      if (IRI.isDef()) {
+        storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI,
+                              TRI, PHIInfo);
+      }
+    }
+  }
+}
+
+void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
+                                     const MachineRegisterInfo *MRI,
+                                     const TargetRegisterInfo *TRI,
+                                     PHILinearize &PHIInfo,
+                                     RegionMRT *CurrentTopRegion) {
+  MachineBasicBlock *Exit = Region->getSucc();
+
+  RegionMRT *TopRegion =
+      CurrentTopRegion == nullptr ? Region : CurrentTopRegion;
+
+  // Check if exit is end of function, if so, no live outs.
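+  // (A region with a null successor ends at a function exit; nothing can be
+  // live past a return.)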
+  if (Exit == nullptr)
+    return;
+
+  auto Children = Region->getChildren();
+  for (auto CI : *Children) {
+    if (CI->isMBB()) {
+      auto MBB = CI->getMBBMRT()->getMBB();
+      storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion);
+    } else {
+      LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
+      // We should only store registers that are live out of the
+      // linearized region
+      for (auto MBBI : SubRegion->MBBs) {
+        storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
+      }
+    }
+  }
+
+  if (CurrentTopRegion == nullptr) {
+    auto Succ = Region->getSucc();
+    for (auto &II : *Succ) {
+      if (II.isPHI()) {
+        MachineInstr &PHI = II;
+        int numPreds = getPHINumInputs(PHI);
+        for (int i = 0; i < numPreds; ++i) {
+          if (Region->contains(getPHIPred(PHI, i))) {
+            unsigned PHIReg = getPHISourceReg(PHI, i);
+            DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+                         << "): " << PrintReg(PHIReg, TRI) << "\n");
+            addLiveOut(PHIReg);
+          }
+        }
+      }
+    }
+  }
+}
+
+void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+  OS << "Linearized Region {";
+  bool IsFirst = true;
+  for (const auto &MBB : MBBs) {
+    if (IsFirst) {
+      IsFirst = false;
+    } else {
+      OS << " ,";
+    }
+    OS << MBB->getNumber();
+  }
+  OS << "} (" << Entry->getNumber() << ", "
+     << (Exit == nullptr ? -1 : Exit->getNumber())
+     << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
+     << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+  for (auto &LI : LiveOuts) {
+    OS << PrintReg(LI, TRI) << " ";
+  }
+  OS << "} \n";
+}
+
+unsigned LinearizedRegion::getBBSelectRegIn() {
+  return getRegionMRT()->getBBSelectRegIn();
+}
+
+unsigned LinearizedRegion::getBBSelectRegOut() {
+  return getRegionMRT()->getBBSelectRegOut();
+}
+
+void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; }
+
+bool LinearizedRegion::getHasLoop() { return HasLoop; }
+
+void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
+
+void LinearizedRegion::removeLiveOut(unsigned Reg) {
+  if (isLiveOut(Reg))
+    LiveOuts.erase(Reg);
+}
+
+void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
+  if (isLiveOut(OldReg)) {
+    removeLiveOut(OldReg);
+    addLiveOut(NewReg);
+  }
+}
+
+void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+                                       MachineRegisterInfo *MRI,
+                                       bool ReplaceInside, bool ReplaceOutside,
+                                       bool IncludeLoopPHI) {
+  assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+  DEBUG(dbgs() << "Preparing to replace register (region): "
+               << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
+               << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+
+  // If we are replacing outside, we also need to update the LiveOuts
+  if (ReplaceOutside &&
+      (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
+    LinearizedRegion *Current = this;
+    while (Current != nullptr && Current->getEntry() != nullptr) {
+      DEBUG(dbgs() << "Region before register replace\n");
+      DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+      Current->replaceLiveOut(Register, NewRegister);
+      DEBUG(dbgs() << "Region after register replace\n");
+      DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+      Current = Current->getParent();
+    }
+  }
+
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+                                         E = MRI->reg_end();
+       I != E;) {
+    MachineOperand &O = *I;
+    ++I;
+
+    // We don't rewrite defs.
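+    // Only uses are rewritten; the old def keeps defining Register, and
+    // NewRegister is expected to get its def elsewhere (e.g. from a merge
+    // PHI). Note that the iterator was advanced above before any setReg()
+    // call, since rewriting an operand moves it to another use list and
+    // would otherwise invalidate the iteration.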
+ if (O.isDef()) + continue; + + bool IsInside = contains(O.getParent()->getParent()); + bool IsLoopPHI = IsInside && (O.getParent()->isPHI() && + O.getParent()->getParent() == getEntry()); + bool ShouldReplace = (IsInside && ReplaceInside) || + (!IsInside && ReplaceOutside) || + (IncludeLoopPHI && IsLoopPHI); + if (ShouldReplace) { + + if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + DEBUG(dbgs() << "Trying to substitute physical register: " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + llvm_unreachable("Cannot substitute physical registers"); + } else { + DEBUG(dbgs() << "Replacing register (region): " + << PrintReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + O.setReg(NewRegister); + } + } + } +} + +void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs); +} + +void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs); +} + +DenseSet *LinearizedRegion::getLiveOuts() { return &LiveOuts; } + +void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) { + Entry = NewEntry; +} + +MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; } + +void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; } + +MachineBasicBlock *LinearizedRegion::getExit() { return Exit; } + +void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); } + +void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) { + for (const auto &MBB : InnerRegion->MBBs) { + addMBB(MBB); + } +} + +bool LinearizedRegion::contains(MachineBasicBlock *MBB) { + return MBBs.count(MBB) == 1; +} + +bool LinearizedRegion::isLiveOut(unsigned Reg) { + return LiveOuts.count(Reg) == 1; +} + +bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { + return MRI->def_begin(Reg) == MRI->def_end(); +} + +// After the code has been structurized, what was flagged as kills +// before are no longer register kills. 
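+// For example, a use that killed %a in its block may, after linearization,
+// be followed by another dynamic execution of a block that still reads %a,
+// so kill flags on uses outside the defining block are cleared conservatively.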
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + for (auto MBBI : MBBs) { + MachineBasicBlock *MBB = MBBI; + for (auto &II : *MBB) { + for (auto &RI : II.uses()) { + if (RI.isReg()) { + unsigned Reg = RI.getReg(); + if (TRI->isVirtualRegister(Reg)) { + if (hasNoDef(Reg, MRI)) + continue; + if (!MRI->hasOneDef(Reg)) { + DEBUG(this->getEntry()->getParent()->dump()); + DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n"); + } + + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + MachineOperand *Def = &(*(MRI->def_begin(Reg))); + MachineOperand *UseOperand = &(RI); + bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; + if (UseIsOutsideDefMBB && UseOperand->isKill()) { + DEBUG(dbgs() << "Removing kill flag on register: " + << PrintReg(Reg, TRI) << "\n"); + UseOperand->setIsKill(false); + } + } + } + } + } + } +} + +void LinearizedRegion::initLiveOut(RegionMRT *Region, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + storeLiveOuts(Region, MRI, TRI, PHIInfo); +} + +LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + setEntry(MBB); + setExit(MBB); + storeLiveOuts(MBB, MRI, TRI, PHIInfo); + MBBs.insert(MBB); + Parent = nullptr; +} + +LinearizedRegion::LinearizedRegion() { + setEntry(nullptr); + setExit(nullptr); + Parent = nullptr; +} + +LinearizedRegion::~LinearizedRegion() {} + +class AMDGPUMachineCFGStructurizer : public MachineFunctionPass { +private: + const MachineRegionInfo *Regions; + const SIInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + unsigned BBSelectRegister; + PHILinearize PHIInfo; + DenseMap FallthroughMap; + + void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI, + SmallVector &RegionIndices); + void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector &RegionIndices); + void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector &PHINonRegionIndices); + + void storePHILinearizationInfoDest( + unsigned LDestReg, MachineInstr &PHI, + SmallVector *RegionIndices = nullptr); + + unsigned storePHILinearizationInfo(MachineInstr &PHI, + SmallVector *RegionIndices); + + void extractKilledPHIs(MachineBasicBlock *MBB); + + bool shrinkPHI(MachineInstr &PHI, SmallVector &PHIIndices, + unsigned *ReplaceReg); + + bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *SourceMBB, + SmallVector &PHIIndices, unsigned *ReplaceReg); + + void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *LastMerge, + SmallVector &PHIRegionIndices); + void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *IfMBB, + SmallVector &PHIRegionIndices); + void replaceLiveOutRegs(MachineInstr &PHI, + SmallVector &PHIRegionIndices, + unsigned CombinedSourceReg, + LinearizedRegion *LRegion); + void rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge, + MachineInstr &PHI, LinearizedRegion *LRegion); + + void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock 
*LastMerge, + LinearizedRegion *LRegion); + void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB, + MachineInstr &PHI); + void rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB); + + bool regionIsSimpleIf(RegionMRT *Region); + + void transformSimpleIfRegion(RegionMRT *Region); + + void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II); + + void insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL = DebugLoc()); + + MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region); + + void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, unsigned DestRegister, + unsigned IfSourceRegister, unsigned CodeSourceRegister, + bool IsUndefIfSource = false); + + MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBBStart, + MachineBasicBlock *CodeBBEnd, + MachineBasicBlock *SelectBB, unsigned IfReg, + bool InheritPreds); + + void prunePHIInfo(MachineBasicBlock *MBB); + void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg); + + void createEntryPHIs(LinearizedRegion *CurrentRegion); + void resolvePHIInfos(MachineBasicBlock *FunctionEntry); + + void replaceRegisterWith(unsigned Register, unsigned NewRegister); + + MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBB, + LinearizedRegion *LRegion, + unsigned BBSelectRegIn, + unsigned BBSelectRegOut); + + MachineBasicBlock * + createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut); + void ensureCondIsNotKilled(SmallVector Cond); + + void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned BBSelectReg); + + MachineInstr *getDefInstr(unsigned Reg); + void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, unsigned DestReg, + unsigned SourceReg); + bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion, + unsigned Register); + void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + LinearizedRegion *LRegion); + + void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion); + void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion); + + MachineBasicBlock *splitExit(LinearizedRegion *LRegion); + + MachineBasicBlock *splitEntry(LinearizedRegion *LRegion); + + LinearizedRegion *initLinearizedRegion(RegionMRT *Region); + + bool structurizeComplexRegion(RegionMRT *Region); + + bool structurizeRegion(RegionMRT *Region); + + bool structurizeRegions(RegionMRT *Region, bool isTopRegion); + +public: + static char ID; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) { + initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + void initFallthroughMap(MachineFunction &MF); + + void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut); + + unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII); + + RegionMRT *RMRT; + void 
setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; }
+
+  RegionMRT *getRegionMRT() { return RMRT; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+char AMDGPUMachineCFGStructurizer::ID = 0;
+
+bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) {
+  MachineBasicBlock *Entry = Region->getEntry();
+  MachineBasicBlock *Succ = Region->getSucc();
+  bool FoundBypass = false;
+  bool FoundIf = false;
+
+  if (Entry->succ_size() != 2) {
+    return false;
+  }
+
+  for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
+                                              E = Entry->succ_end();
+       SI != E; ++SI) {
+    MachineBasicBlock *Current = *SI;
+
+    if (Current == Succ) {
+      FoundBypass = true;
+    } else if ((Current->succ_size() == 1) &&
+               *(Current->succ_begin()) == Succ) {
+      FoundIf = true;
+    }
+  }
+
+  return FoundIf && FoundBypass;
+}
+
+void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) {
+  MachineBasicBlock *Entry = Region->getEntry();
+  MachineBasicBlock *Exit = Region->getExit();
+  TII->convertNonUniformIfRegion(Entry, Exit);
+}
+
+static void fixMBBTerminator(MachineBasicBlock *MBB) {
+
+  if (MBB->succ_size() == 1) {
+    auto *Succ = *(MBB->succ_begin());
+    for (auto &TI : MBB->terminators()) {
+      for (auto &UI : TI.uses()) {
+        if (UI.isMBB() && UI.getMBB() != Succ) {
+          UI.setMBB(Succ);
+        }
+      }
+    }
+  }
+}
+
+static void fixRegionTerminator(RegionMRT *Region) {
+  MachineBasicBlock *InternalSucc = nullptr;
+  MachineBasicBlock *ExternalSucc = nullptr;
+  LinearizedRegion *LRegion = Region->getLinearizedRegion();
+  auto Exit = LRegion->getExit();
+
+  SmallPtrSet<MachineBasicBlock *, 2> Successors;
+  for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(),
+                                              SE = Exit->succ_end();
+       SI != SE; ++SI) {
+    MachineBasicBlock *Succ = *SI;
+    if (LRegion->contains(Succ)) {
+      // Do not allow re-assign
+      assert(InternalSucc == nullptr);
+      InternalSucc = Succ;
+    } else {
+      // Do not allow re-assign
+      assert(ExternalSucc == nullptr);
+      ExternalSucc = Succ;
+    }
+  }
+
+  for (auto &TI : Exit->terminators()) {
+    for (auto &UI : TI.uses()) {
+      if (UI.isMBB()) {
+        auto Target = UI.getMBB();
+        if (Target != InternalSucc && Target != ExternalSucc) {
+          UI.setMBB(ExternalSucc);
+        }
+      }
+    }
+  }
+}
+
+// If a region is just a sequence of regions (and the exit
+// block in the case of the top level region), we can simply skip
+// linearizing it, because it is already linear
+bool regionIsSequence(RegionMRT *Region) {
+  auto Children = Region->getChildren();
+  for (auto CI : *Children) {
+    if (!CI->isRegion()) {
+      if (CI->getMBBMRT()->getMBB()->succ_size() > 1) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void fixupRegionExits(RegionMRT *Region) {
+  auto Children = Region->getChildren();
+  for (auto CI : *Children) {
+    if (!CI->isRegion()) {
+      fixMBBTerminator(CI->getMBBMRT()->getMBB());
+    } else {
+      fixRegionTerminator(CI->getRegionMRT());
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+    RegionMRT *Region, MachineInstr &PHI,
+    SmallVector<unsigned, 2> &PHIRegionIndices) {
+  unsigned NumInputs = getPHINumInputs(PHI);
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    MachineBasicBlock *Pred = getPHIPred(PHI, i);
+    if (Region->contains(Pred)) {
+      PHIRegionIndices.push_back(i);
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+    LinearizedRegion *Region, MachineInstr &PHI,
+    SmallVector<unsigned, 2> &PHIRegionIndices) {
+  unsigned NumInputs = getPHINumInputs(PHI);
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    MachineBasicBlock *Pred = getPHIPred(PHI, i);
+    if
(Region->contains(Pred)) {
+      PHIRegionIndices.push_back(i);
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices(
+    LinearizedRegion *Region, MachineInstr &PHI,
+    SmallVector<unsigned, 2> &PHINonRegionIndices) {
+  unsigned NumInputs = getPHINumInputs(PHI);
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    MachineBasicBlock *Pred = getPHIPred(PHI, i);
+    if (!Region->contains(Pred)) {
+      PHINonRegionIndices.push_back(i);
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
+    unsigned LDestReg, MachineInstr &PHI,
+    SmallVector<unsigned, 2> *RegionIndices) {
+  if (RegionIndices) {
+    for (auto i : *RegionIndices) {
+      PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+    }
+  } else {
+    unsigned NumInputs = getPHINumInputs(PHI);
+    for (unsigned i = 0; i < NumInputs; ++i) {
+      PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+    }
+  }
+}
+
+unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
+    MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
+  unsigned DestReg = getPHIDestReg(PHI);
+  unsigned LinearizeDestReg =
+      MRI->createVirtualRegister(MRI->getRegClass(DestReg));
+  PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
+  storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
+  return LinearizeDestReg;
+}
+
+void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
+  // We need to create a new chain for the killed phi, but there is no
+  // need to do the renaming outside or inside the block.
+  SmallPtrSet<MachineInstr *, 2> PHIs;
+  for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+                                         E = MBB->instr_end();
+       I != E; ++I) {
+    MachineInstr &Instr = *I;
+    if (Instr.isPHI()) {
+      unsigned PHIDestReg = getPHIDestReg(Instr);
+      DEBUG(dbgs() << "Extracting killed phi:\n");
+      DEBUG(Instr.dump());
+      PHIs.insert(&Instr);
+      PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
+      storePHILinearizationInfoDest(PHIDestReg, Instr);
+    }
+  }
+
+  for (auto PI : PHIs) {
+    PI->eraseFromParent();
+  }
+}
+
+static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices,
+                             unsigned Index) {
+  for (auto i : PHIRegionIndices) {
+    if (i == Index)
+      return true;
+  }
+  return false;
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+                                       SmallVector<unsigned, 2> &PHIIndices,
+                                       unsigned *ReplaceReg) {
+  return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg);
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+                                       unsigned CombinedSourceReg,
+                                       MachineBasicBlock *SourceMBB,
+                                       SmallVector<unsigned, 2> &PHIIndices,
+                                       unsigned *ReplaceReg) {
+  DEBUG(dbgs() << "Shrink PHI: ");
+  DEBUG(PHI.dump());
+  DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
+               << " = PHI(");
+
+  bool Replaced = false;
+  unsigned NumInputs = getPHINumInputs(PHI);
+  int SingleExternalEntryIndex = -1;
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    if (!isPHIRegionIndex(PHIIndices, i)) {
+      if (SingleExternalEntryIndex == -1) {
+        // Single entry
+        SingleExternalEntryIndex = i;
+      } else {
+        // Multiple entries
+        SingleExternalEntryIndex = -2;
+      }
+    }
+  }
+
+  if (SingleExternalEntryIndex > -1) {
+    *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex);
+    // We should not rewrite the code, we should only pick up the single value
+    // that represents the shrunk PHI.
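+    // For example, shrinking %dst = PHI(%x, BB#0, %a, BB#2, %b, BB#3) with
+    // region indices {1, 2} leaves only %x, so the caller can substitute %x
+    // for %dst instead of keeping a one-input PHI.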
+ Replaced = true; + } else { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + if (SourceMBB) { + MIB.addReg(CombinedSourceReg); + MIB.addMBB(SourceMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << SourceMBB->getNumber()); + } + + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } + PHI.eraseFromParent(); + return Replaced; +} + +void AMDGPUMachineCFGStructurizer::replacePHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge, + SmallVector &PHIRegionIndices) { + DEBUG(dbgs() << "Replace PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI) + << " = PHI("); + + bool HasExternalEdge = false; + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (!isPHIRegionIndex(PHIRegionIndices, i)) { + HasExternalEdge = true; + } + } + + if (HasExternalEdge) { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(LastMerge); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << LastMerge->getNumber()); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } else { + replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg); + } + PHI.eraseFromParent(); +} + +void AMDGPUMachineCFGStructurizer::replaceEntryPHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, + SmallVector &PHIRegionIndices) { + + DEBUG(dbgs() << "Replace entry PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with "); + + unsigned NumInputs = getPHINumInputs(PHI); + unsigned NumNonRegionInputs = NumInputs; + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + NumNonRegionInputs--; + } + } + + if (NumNonRegionInputs == 0) { + auto DestReg = getPHIDestReg(PHI); + replaceRegisterWith(DestReg, CombinedSourceReg); + DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n"); + PHI.eraseFromParent(); + } else { + DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << " = PHI("); + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(IfMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << IfMBB->getNumber()); + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << 
", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + PHI.eraseFromParent(); + } +} + +void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( + MachineInstr &PHI, SmallVector &PHIRegionIndices, + unsigned CombinedSourceReg, LinearizedRegion *LRegion) { + bool WasLiveOut = false; + for (auto PII : PHIRegionIndices) { + unsigned Reg = getPHISourceReg(PHI, PII); + if (LRegion->isLiveOut(Reg)) { + bool IsDead = true; + + // Check if register is live out of the basic block + MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent(); + for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) { + if ((*UI).getParent()->getParent() != DefMBB) { + IsDead = false; + } + } + + DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is " + << (IsDead ? "dead" : "alive") << " after PHI replace\n"); + if (IsDead) { + LRegion->removeLiveOut(Reg); + } + WasLiveOut = true; + } + } + + if (WasLiveOut) + LRegion->addLiveOut(CombinedSourceReg); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region, + MachineBasicBlock *LastMerge, + MachineInstr &PHI, + LinearizedRegion *LRegion) { + SmallVector PHIRegionIndices; + getPHIRegionIndices(Region, PHI, PHIRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHIRegionIndices); + + replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices); + replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region, + MachineBasicBlock *IfMBB, + MachineInstr &PHI) { + SmallVector PHINonRegionIndices; + getPHINonRegionIndices(Region, PHI, PHINonRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHINonRegionIndices); + replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices); +} + +static void collectPHIs(MachineBasicBlock *MBB, + SmallVector &PHIs) { + for (auto &BBI : *MBB) { + if (BBI.isPHI()) { + PHIs.push_back(&BBI); + } + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region, + MachineBasicBlock *LastMerge, + LinearizedRegion *LRegion) { + SmallVector PHIs; + auto Exit = Region->getSucc(); + if (Exit == nullptr) + return; + + collectPHIs(Exit, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion); + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB) { + SmallVector PHIs; + auto Entry = Region->getEntry(); + + collectPHIs(Entry, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionEntryPHI(Region, IfMBB, *PHII); + } +} + +void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL) { + DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() + << " -> " << Dest->getNumber() << "\n"); + MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator(); + bool HasTerminator = Terminator != MBB->instr_end(); + if (HasTerminator) { + TII->ReplaceTailWithBranchTo(Terminator, Dest); + } + if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) { + TII->insertUnconditionalBranch(*MBB, Dest, DL); + } +} + +static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) { + MachineBasicBlock *result = nullptr; + for (auto &MFI : MF) { + if (MFI.succ_size() == 0) { + if (result == nullptr) { + result = &MFI; + } else { + return nullptr; + } + } + } + + return result; +} + +static bool 
hasOneExitNode(MachineFunction &MF) {
+  return getSingleExitNode(MF) != nullptr;
+}
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
+  auto Exit = Region->getSucc();
+
+  // If the exit is the end of the function, we just use the existing
+  // exit block.
+  MachineFunction *MF = Region->getEntry()->getParent();
+  if (Exit == nullptr && hasOneExitNode(*MF)) {
+    return &(*(--(Region->getEntry()->getParent()->end())));
+  }
+
+  MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock();
+  if (Exit == nullptr) {
+    MachineFunction::iterator ExitIter = MF->end();
+    MF->insert(ExitIter, LastMerge);
+  } else {
+    MachineFunction::iterator ExitIter = Exit->getIterator();
+    MF->insert(ExitIter, LastMerge);
+    LastMerge->addSuccessor(Exit);
+    insertUnconditionalBranch(LastMerge, Exit);
+    DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+  }
+  return LastMerge;
+}
+
+void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
+                                            MachineBasicBlock *CodeBB,
+                                            MachineBasicBlock *MergeBB,
+                                            unsigned DestRegister,
+                                            unsigned IfSourceRegister,
+                                            unsigned CodeSourceRegister,
+                                            bool IsUndefIfSource) {
+  // If this is the function exit block, we don't need a phi.
+  if (MergeBB->succ_begin() == MergeBB->succ_end()) {
+    return;
+  }
+  DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber()
+               << "): " << PrintReg(DestRegister, TRI) << " = PHI("
+               << PrintReg(IfSourceRegister, TRI) << ", BB#"
+               << IfBB->getNumber() << ", "
+               << PrintReg(CodeSourceRegister, TRI)
+               << ", BB#" << CodeBB->getNumber() << ")\n");
+  const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
+  MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
+                                    TII->get(TargetOpcode::PHI), DestRegister);
+  if (IsUndefIfSource && false) {
+    MIB.addReg(IfSourceRegister, RegState::Undef);
+  } else {
+    MIB.addReg(IfSourceRegister);
+  }
+  MIB.addMBB(IfBB);
+  MIB.addReg(CodeSourceRegister);
+  MIB.addMBB(CodeBB);
+}
+
+static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) {
+  for (MachineBasicBlock::succ_iterator PI = MBB->succ_begin(),
+                                        E = MBB->succ_end();
+       PI != E; ++PI) {
+    if ((*PI) != MBB) {
+      (MBB)->removeSuccessor(*PI);
+    }
+  }
+}
+
+static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
+                                   MachineBasicBlock *EndMBB) {
+
+  // We have to check against the StartMBB successor because a
+  // structurized region with a loop will have the entry block split,
+  // and the backedge will go to the entry successor.
+  DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs;
+  unsigned SuccSize = StartMBB->succ_size();
+  if (SuccSize > 0) {
+    MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin());
+    for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(),
+                                          E = EndMBB->succ_end();
+         PI != E; ++PI) {
+      // Either we have a back-edge to the entry block, or a back-edge to the
+      // successor of the entry block since the block may be split.
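+      // That is, when the entry has been split, the backedge targets the
+      // entry's single successor instead of the entry itself; both edge
+      // shapes are internal to the region and are kept (skipped below).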
+      if ((*PI) != StartMBB &&
+          !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
+        Succs.insert(
+            std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI));
+      }
+    }
+  }
+
+  for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(),
+                                        E = StartMBB->pred_end();
+       PI != E; ++PI) {
+    if ((*PI) != EndMBB) {
+      Succs.insert(
+          std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB));
+    }
+  }
+
+  for (auto SI : Succs) {
+    std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
+    DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#"
+                 << Edge.second->getNumber() << "\n");
+    Edge.first->removeSuccessor(Edge.second);
+  }
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
+    MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart,
+    MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg,
+    bool InheritPreds) {
+  MachineFunction *MF = MergeBB->getParent();
+  MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock();
+
+  if (InheritPreds) {
+    for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(),
+                                          E = CodeBBStart->pred_end();
+         PI != E; ++PI) {
+      if ((*PI) != CodeBBEnd) {
+        MachineBasicBlock *Pred = (*PI);
+        Pred->addSuccessor(IfBB);
+      }
+    }
+  }
+
+  removeExternalCFGEdges(CodeBBStart, CodeBBEnd);
+
+  auto CodeBBStartI = CodeBBStart->getIterator();
+  auto CodeBBEndI = CodeBBEnd->getIterator();
+  auto MergeIter = MergeBB->getIterator();
+  MF->insert(MergeIter, IfBB);
+  MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI);
+  IfBB->addSuccessor(MergeBB);
+  IfBB->addSuccessor(CodeBBStart);
+
+  DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+  // Ensure that the MergeBB is a successor of the CodeEndBB.
+  if (!CodeBBEnd->isSuccessor(MergeBB))
+    CodeBBEnd->addSuccessor(MergeBB);
+
+  DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#"
+               << CodeBBEnd->getNumber() << "\n");
+
+  // If we have a single predecessor we can find a reasonable debug location
+  MachineBasicBlock *SinglePred =
+      CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr;
+  const DebugLoc &DL =
+      SinglePred ? SinglePred->findDebugLoc(SinglePred->getFirstTerminator())
+                 : DebugLoc();
+
+  unsigned Reg =
+      TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg,
+                    SelectBB->getNumber() /* CodeBBStart->getNumber() */);
+  if (&(*(IfBB->getParent()->begin())) == IfBB) {
+    TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg,
+                              CodeBBStart->getNumber());
+  }
+  MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+  ArrayRef<MachineOperand> Cond(RegOp);
+  TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL);
+
+  return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
+    SmallVector<MachineOperand, 1> Cond) {
+  if (Cond.size() != 1)
+    return;
+  if (!Cond[0].isReg())
+    return;
+
+  unsigned CondReg = Cond[0].getReg();
+  for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
+    (*UI).setIsKill(false);
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+                                                     MachineBasicBlock *MergeBB,
+                                                     unsigned BBSelectReg) {
+  MachineBasicBlock *TrueBB = nullptr;
+  MachineBasicBlock *FalseBB = nullptr;
+  SmallVector<MachineOperand, 1> Cond;
+  MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB];
+  TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond);
+
+  const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator());
+
+  if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) {
+    // This is an exit block, hence no successors. We will assign the
+    // bb select register to the entry block.
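+    // The value only has to be a valid block number, since no branch is
+    // taken on it after an exit block; the function entry's number is used
+    // as an always-available placeholder.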
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, + CodeBB->getParent()->begin()->getNumber()); + insertUnconditionalBranch(CodeBB, MergeBB, DL); + return; + } + + if (FalseBB == nullptr && TrueBB == nullptr) { + TrueBB = FallthroughBB; + } else if (TrueBB != nullptr) { + FalseBB = + (FallthroughBB && (FallthroughBB != TrueBB)) ? FallthroughBB : FalseBB; + } + + if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) { + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, TrueBB->getNumber()); + } else { + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); + unsigned TrueBBReg = MRI->createVirtualRegister(RegClass); + unsigned FalseBBReg = MRI->createVirtualRegister(RegClass); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + TrueBBReg, TrueBB->getNumber()); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + FalseBBReg, FalseBB->getNumber()); + ensureCondIsNotKilled(Cond); + TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, Cond, TrueBBReg, FalseBBReg); + } + + insertUnconditionalBranch(CodeBB, MergeBB, DL); +} + +MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + DEBUG(dbgs() << "DEFS BEGIN:\n"); + for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { + DEBUG(DI->getParent()->dump()); + } + DEBUG(dbgs() << "DEFS END\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + return (*(MRI->def_begin(Reg))).getParent(); +} + +void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, + MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + unsigned DestReg, + unsigned SourceReg) { + // In this function we know we are part of a chain already, so we need + // to add the registers to the existing chain, and rename the register + // inside the region. + bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit(); + MachineInstr *DefInstr = getDefInstr(SourceReg); + if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) { + // Handle the case where the def is a PHI-def inside a basic + // block, then we only need to do renaming. Special care needs to + // be taken if the PHI-def is part of an existing chain, or if a + // new one needs to be created. + InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI); + + // We collect all PHI Information, and if we are at the region entry, + // all PHIs will be removed, and then re-introduced if needed. + storePHILinearizationInfoDest(DestReg, *DefInstr); + // We have picked up all the information we need now and can remove + // the PHI + PHIInfo.removeSource(DestReg, SourceReg, CodeBB); + DefInstr->eraseFromParent(); + } else { + // If this is not a phi-def, or it is a phi-def but from a linearized region + if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) { + // If this is a single BB and the definition is in this block we + // need to replace any uses outside the region. 
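+      // Uses inside the single-block region keep SourceReg; only external
+      // uses are switched to DestReg, which the merge PHI inserted below
+      // will define.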
+      InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
+    }
+    const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
+    unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+    bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
+    DEBUG(dbgs() << "Insert Chained PHI\n");
+    insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
+                   SourceReg, IsLastDef);
+
+    PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+    if (IsLastDef) {
+      const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
+      TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
+                                NextDestReg, 0);
+      PHIInfo.deleteDef(DestReg);
+    } else {
+      PHIInfo.replaceDef(DestReg, NextDestReg);
+    }
+  }
+}
+
+bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB,
+                                         LinearizedRegion *InnerRegion,
+                                         unsigned Register) {
+  return getDefInstr(Register)->getParent() == MBB ||
+         InnerRegion->contains(getDefInstr(Register)->getParent());
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
+                                                MachineBasicBlock *CodeBB,
+                                                MachineBasicBlock *MergeBB,
+                                                LinearizedRegion *InnerRegion,
+                                                LinearizedRegion *LRegion) {
+  DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts();
+  SmallVector<unsigned, 4> OldLiveOuts;
+  bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+  for (auto OLI : *LiveOuts) {
+    OldLiveOuts.push_back(OLI);
+  }
+
+  for (auto LI : OldLiveOuts) {
+    DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+    if (!containsDef(CodeBB, InnerRegion, LI) ||
+        (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
+      // If the register simply lives through the CodeBB, we don't have
+      // to rewrite anything since the register is not defined in this
+      // part of the code.
+      DEBUG(dbgs() << "- through");
+      continue;
+    }
+    DEBUG(dbgs() << "\n");
+    unsigned Reg = LI;
+    if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
+      // If the register is live out, we do want to create a phi,
+      // unless it is from the Exit block, because in that case there
+      // is already a PHI, and no need to create a new one.
+
+      // If the register is just a live out def and not part of a phi
+      // chain, we need to create a PHI node to handle the if region,
+      // and replace all uses outside of the region with the new dest
+      // register, unless it is the outgoing BB select register. We have
+      // already created phi nodes for these.
+      const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+      unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
+      unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+      // Create initializer, this value is never used, but is needed
+      // to satisfy SSA.
+      DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+      TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
+                                IfSourceReg, 0);
+
+      InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
+      DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+      insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
+                     IfSourceReg, Reg, true);
+    }
+  }
+
+  // Handle the chained definitions in PHIInfo, checking if this basic block
+  // is a source block for a definition.
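+  // For every PHIInfo source register defined in CodeBB, the chain is
+  // extended with a merge PHI (see insertChainedPHI above) so the value
+  // reaches the end of the linearized region.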
+  SmallVector<unsigned, 4> Sources;
+  if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
+    DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber()
+                 << "\n");
+    for (auto SI : Sources) {
+      unsigned DestReg;
+      PHIInfo.findDest(SI, CodeBB, DestReg);
+      insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
+    }
+    DEBUG(dbgs() << "Insertion done.\n");
+  }
+
+  DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Before PHI Prune\n");
+  DEBUG(PHIInfo.dump(MRI));
+  SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
+      EliminatedSources;
+  for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+       ++DRI) {
+
+    unsigned DestReg = *DRI;
+    auto SE = PHIInfo.sources_end(DestReg);
+
+    bool MBBContainsPHISource = false;
+    // Check if there is a PHI source in this MBB
+    for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+      unsigned SourceReg = (*SRI).first;
+      MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+      if (Def->getParent()->getParent() == MBB) {
+        MBBContainsPHISource = true;
+      }
+    }
+
+    // If so, all other sources are useless since we know this block
+    // is always executed when the region is executed.
+    if (MBBContainsPHISource) {
+      for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+        PHILinearize::PHISourceT Source = *SRI;
+        unsigned SourceReg = Source.first;
+        MachineBasicBlock *SourceMBB = Source.second;
+        MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+        if (Def->getParent()->getParent() != MBB) {
+          EliminatedSources.push_back(
+              std::make_tuple(DestReg, SourceReg, SourceMBB));
+        }
+      }
+    }
+  }
+
+  // Remove the PHI sources that are in the given MBB
+  for (auto &SourceInfo : EliminatedSources) {
+    PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
+                         std::get<2>(SourceInfo));
+  }
+  DEBUG(dbgs() << "After PHI Prune\n");
+  DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
+                                                  unsigned DestReg) {
+  MachineBasicBlock *Entry = CurrentRegion->getEntry();
+  MachineBasicBlock *Exit = CurrentRegion->getExit();
+
+  DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
+               << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+
+  int NumSources = 0;
+  auto SE = PHIInfo.sources_end(DestReg);
+
+  for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+    NumSources++;
+  }
+
+  if (NumSources == 1) {
+    auto SRI = PHIInfo.sources_begin(DestReg);
+    unsigned SourceReg = (*SRI).first;
+    replaceRegisterWith(DestReg, SourceReg);
+  } else {
+    const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
+    MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
+                                      TII->get(TargetOpcode::PHI), DestReg);
+    DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << " = PHI(");
+
+    unsigned CurrentBackedgeReg = 0;
+
+    for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+      unsigned SourceReg = (*SRI).first;
+
+      if (CurrentRegion->contains((*SRI).second)) {
+        if (CurrentBackedgeReg == 0) {
+          CurrentBackedgeReg = SourceReg;
+        } else {
+          MachineInstr *PHIDefInstr = getDefInstr(SourceReg);
+          MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
+          const TargetRegisterClass *RegClass =
+              MRI->getRegClass(CurrentBackedgeReg);
+          unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+          MachineInstrBuilder BackedgePHI =
+              BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
+                      TII->get(TargetOpcode::PHI), NewBackedgeReg);
+          BackedgePHI.addReg(CurrentBackedgeReg);
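+          // Machine-level PHIs take (value, predecessor) operand pairs:
+          // the running backedge value forms the first pair, the new
+          // region-internal source the second.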
+          BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0));
+          BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
+          BackedgePHI.addMBB((*SRI).second);
+          DEBUG(dbgs() << "Inserting backedge PHI: "
+                       << PrintReg(NewBackedgeReg, TRI) << " = PHI("
+                       << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+                       << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", "
+                       << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
+                       << ", BB#" << (*SRI).second->getNumber());
+          CurrentBackedgeReg = NewBackedgeReg;
+        }
+      } else {
+        MIB.addReg(SourceReg);
+        MIB.addMBB((*SRI).second);
+        DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+                     << (*SRI).second->getNumber() << ", ");
+      }
+    }
+
+    // Add the final backedge register source to the entry phi
+    if (CurrentBackedgeReg != 0) {
+      MIB.addReg(CurrentBackedgeReg);
+      MIB.addMBB(Exit);
+      DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+                   << Exit->getNumber() << ")\n");
+    } else {
+      DEBUG(dbgs() << ")\n");
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
+  DEBUG(PHIInfo.dump(MRI));
+
+  for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+       ++DRI) {
+
+    unsigned DestReg = *DRI;
+    createEntryPHI(CurrentRegion, DestReg);
+  }
+  PHIInfo.clear();
+}
+
+void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
+                                                       unsigned NewRegister) {
+  assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+                                         E = MRI->reg_end();
+       I != E;) {
+    MachineOperand &O = *I;
+    ++I;
+    if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+      DEBUG(dbgs() << "Trying to substitute physical register: "
+                   << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+                   << "\n");
+      llvm_unreachable("Cannot substitute physical registers");
+      // We don't handle physical registers, but if we need to
+      // in the future, this is how we would do it:
+      // O.substPhysReg(NewRegister, *TRI);
+    } else {
+      DEBUG(dbgs() << "Replacing register: "
+                   << PrintReg(Register, MRI->getTargetRegisterInfo())
+                   << " with "
+                   << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+                   << "\n");
+      O.setReg(NewRegister);
+    }
+  }
+  PHIInfo.deleteDef(Register);
+
+  getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+
+  DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
+  DEBUG(dbgs() << "Resolve PHI Infos\n");
+  DEBUG(PHIInfo.dump(MRI));
+  for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+       ++DRI) {
+    unsigned DestReg = *DRI;
+    DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n");
+    auto SRI = PHIInfo.sources_begin(DestReg);
+    unsigned SourceReg = (*SRI).first;
+    DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI)
+                 << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n");
+
+    assert(PHIInfo.sources_end(DestReg) == ++SRI &&
+           "More than one phi source in entry node");
+    replaceRegisterWith(DestReg, SourceReg);
+  }
+}
+
+static bool isFunctionEntryBlock(MachineBasicBlock *MBB) {
+  return ((&(*(MBB->getParent()->begin()))) == MBB);
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+    MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB,
+    LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn,
+    unsigned BBSelectRegOut) {
+  if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) {
+    // Handle non-loop function entry block.
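+    // The entry block can be finished in place: rewrite its terminator to
+    // set the BB select register and fall through to the merge block.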
+ // We need to allow loops to the entry block and then + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + resolvePHIInfos(CodeBB); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } + if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) { + // Handle non-loop region entry block. + MachineFunction *MF = MergeBB->getParent(); + auto MergeIter = MergeBB->getIterator(); + auto CodeBBStartIter = CodeBB->getIterator(); + auto CodeBBEndIter = ++(CodeBB->getIterator()); + if (CodeBBEndIter != MergeIter) { + MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter); + } + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + prunePHIInfo(CodeBB); + createEntryPHIs(CurrentRegion); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } else { + // Handle internal block. + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); + unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); + bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, + BBSelectRegIn, IsRegionEntryBB); + CurrentRegion->addMBB(IfBB); + // If this is the entry block we need to make the If block the new + // linearized region entry. + if (IsRegionEntryBB) { + CurrentRegion->setEntry(IfBB); + + if (CurrentRegion->getHasLoop()) { + MachineBasicBlock *RegionExit = CurrentRegion->getExit(); + MachineBasicBlock *ETrueBB = nullptr; + MachineBasicBlock *EFalseBB = nullptr; + SmallVector ECond; + + const DebugLoc &DL = DebugLoc(); + TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); + TII->removeBranch(*RegionExit); + + // We need to create a backedge if there is a loop + unsigned Reg = TII->insertNE( + RegionExit, RegionExit->instr_end(), DL, + CurrentRegion->getRegionMRT()->getInnerOutputRegister(), + CurrentRegion->getRegionMRT()->getEntry()->getNumber()); + MachineOperand RegOp = + MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef Cond(RegOp); + DEBUG(dbgs() << "RegionExitReg: "); + DEBUG(Cond[0].print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, + Cond, DebugLoc()); + RegionExit->addSuccessor(CurrentRegion->getEntry()); + } + } + CurrentRegion->addMBB(CodeBB); + LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); + + InnerRegion.setParent(CurrentRegion); + DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); + insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, + CodeBBSelectReg); + InnerRegion.addMBB(MergeBB); + + DEBUG(InnerRegion.print(dbgs(), TRI)); + rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); + extractKilledPHIs(CodeBB); + if (IsRegionEntryBB) { + createEntryPHIs(CurrentRegion); + } + return IfBB; + } +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( + MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut) { + unsigned CodeBBSelectReg = + InnerRegion->getRegionMRT()->getInnerOutputRegister(); + MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry(); + MachineBasicBlock *CodeExitBB = InnerRegion->getExit(); + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB, + SelectBB, BBSelectRegIn, true); + 
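+  // The if-block materializes a test of BBSelectRegIn and either executes
+  // the inner region or branches straight to MergeBB around it.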
+  CurrentRegion->addMBB(IfBB);
+  bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry();
+  if (isEntry) {
+
+    if (CurrentRegion->getHasLoop()) {
+      MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+      MachineBasicBlock *ETrueBB = nullptr;
+      MachineBasicBlock *EFalseBB = nullptr;
+      SmallVector<MachineOperand, 1> ECond;
+
+      const DebugLoc &DL = DebugLoc();
+      TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+      TII->removeBranch(*RegionExit);
+
+      // We need to create a backedge if there is a loop
+      unsigned Reg =
+          TII->insertNE(RegionExit, RegionExit->instr_end(), DL,
+                        CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+                        CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+      MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+      ArrayRef<MachineOperand> Cond(RegOp);
+      DEBUG(dbgs() << "RegionExitReg: ");
+      DEBUG(Cond[0].print(dbgs(), TRI));
+      DEBUG(dbgs() << "\n");
+      TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+                        Cond, DebugLoc());
+      RegionExit->addSuccessor(IfBB);
+    }
+  }
+  CurrentRegion->addMBBs(InnerRegion);
+  DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+  insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+                 CodeBBSelectReg);
+
+  rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion,
+                     CurrentRegion);
+
+  rewriteRegionEntryPHIs(InnerRegion, IfBB);
+
+  if (isEntry) {
+    CurrentRegion->setEntry(IfBB);
+  }
+
+  if (isEntry) {
+    createEntryPHIs(CurrentRegion);
+  }
+
+  return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
+                                                MachineBasicBlock *Entry,
+                                                MachineBasicBlock *EntrySucc,
+                                                LinearizedRegion *LRegion) {
+  SmallVector<unsigned, 2> PHIRegionIndices;
+  getPHIRegionIndices(LRegion, PHI, PHIRegionIndices);
+
+  assert(PHIRegionIndices.size() == 1);
+
+  unsigned RegionIndex = PHIRegionIndices[0];
+  unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex);
+  MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex);
+  unsigned PHIDest = getPHIDestReg(PHI);
+  unsigned PHISource = PHIDest;
+  unsigned ReplaceReg;
+
+  if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) {
+    PHISource = ReplaceReg;
+  }
+
+  const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
+  unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+  LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
+  MachineInstrBuilder MIB =
+      BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
+              TII->get(TargetOpcode::PHI), NewDestReg);
+  DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI)
+               << " = PHI(");
+  MIB.addReg(PHISource);
+  MIB.addMBB(Entry);
+  DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber());
+  MIB.addReg(RegionSourceReg);
+  MIB.addMBB(RegionSourceMBB);
+  DEBUG(dbgs() << ", " << PrintReg(RegionSourceReg, TRI) << ", BB#"
+               << RegionSourceMBB->getNumber() << ")\n");
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
+                                                 MachineBasicBlock *EntrySucc,
+                                                 LinearizedRegion *LRegion) {
+  SmallVector<MachineInstr *, 2> PHIs;
+  collectPHIs(Entry, PHIs);
+
+  for (auto PHII : PHIs) {
+    splitLoopPHI(*PHII, Entry, EntrySucc, LRegion);
+  }
+}
+
+// Split the exit block so that we can insert an end-control-flow instruction
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
+  auto MRTRegion = LRegion->getRegionMRT();
+  auto Exit = LRegion->getExit();
+  auto MF = Exit->getParent();
+  auto Succ = MRTRegion->getSucc();
+
+  auto NewExit = MF->CreateMachineBasicBlock();
+  auto AfterExitIter = Exit->getIterator();
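+  // Place the new exit immediately after the old one in the function
+  // layout so the old exit falls through into it.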
+  AfterExitIter++;
+  MF->insert(AfterExitIter, NewExit);
+  Exit->removeSuccessor(Succ);
+  Exit->addSuccessor(NewExit);
+  NewExit->addSuccessor(Succ);
+  insertUnconditionalBranch(NewExit, Succ);
+  LRegion->addMBB(NewExit);
+  LRegion->setExit(NewExit);
+
+  DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+
+  // Replace any PHI Predecessors in the successor with NewExit
+  for (auto &II : *Succ) {
+    MachineInstr &Instr = II;
+
+    // If we are past the PHI instructions we are done
+    if (!Instr.isPHI())
+      break;
+
+    int numPreds = getPHINumInputs(Instr);
+    for (int i = 0; i < numPreds; ++i) {
+      auto Pred = getPHIPred(Instr, i);
+      if (Pred == Exit) {
+        setPhiPred(Instr, i, NewExit);
+      }
+    }
+  }
+
+  return NewExit;
+}
+
+
+static MachineBasicBlock *split(MachineBasicBlock::iterator I) {
+  // Create the fall-through block.
+  MachineBasicBlock *MBB = (*I).getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+  auto MBBIter = ++(MBB->getIterator());
+  MF->insert(MBBIter, SuccMBB);
+  SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(SuccMBB);
+
+  // Splice the code over.
+  SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end());
+
+  return SuccMBB;
+}
+
+// Split the entry block, separating the PHI nodes from the rest of the code.
+// This is needed to insert an initializer for the bb select register in
+// loop regions.
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
+  MachineBasicBlock *Entry = LRegion->getEntry();
+  MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
+  MachineBasicBlock *Exit = LRegion->getExit();
+
+  DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#"
+               << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber()
+               << "\n");
+  LRegion->addMBB(EntrySucc);
+
+  // Make the backedge go to Entry Succ
+  if (Exit->isSuccessor(Entry)) {
+    Exit->removeSuccessor(Entry);
+  }
+  Exit->addSuccessor(EntrySucc);
+  MachineInstr &Branch = *(Exit->instr_rbegin());
+  for (auto &UI : Branch.uses()) {
+    if (UI.isMBB() && UI.getMBB() == Entry) {
+      UI.setMBB(EntrySucc);
+    }
+  }
+
+  splitLoopPHIs(Entry, EntrySucc, LRegion);
+
+  return EntrySucc;
+}
+
+LinearizedRegion *
+AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) {
+  LinearizedRegion *LRegion = Region->getLinearizedRegion();
+  LRegion->initLiveOut(Region, MRI, TRI, PHIInfo);
+  LRegion->setEntry(Region->getEntry());
+  return LRegion;
+}
+
+static void removeOldExitPreds(RegionMRT *Region) {
+  MachineBasicBlock *Exit = Region->getSucc();
+  if (Exit == nullptr) {
+    return;
+  }
+  for (MachineBasicBlock::pred_iterator PI = Exit->pred_begin(),
+                                        E = Exit->pred_end();
+       PI != E; ++PI) {
+    if (Region->contains(*PI)) {
+      (*PI)->removeSuccessor(Exit);
+    }
+  }
+}
+
+static bool mbbHasBackEdge(MachineBasicBlock *MBB,
+                           SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+    if (MBBs.count(*SI) != 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool containsNewBackedge(MRT *Tree,
+                                SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+  // Need to traverse this in reverse since it is in post order.
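+  // A backedge shows up as a successor that is already in MBBs, i.e. a
+  // block later in the post-order walk; MBBs accumulates the visited set.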
+  if (Tree == nullptr)
+    return false;
+
+  if (Tree->isMBB()) {
+    MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB();
+    MBBs.insert(MBB);
+    if (mbbHasBackEdge(MBB, MBBs)) {
+      return true;
+    }
+  } else {
+    RegionMRT *Region = Tree->getRegionMRT();
+    SetVector<MRT *> *Children = Region->getChildren();
+    for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) {
+      if (containsNewBackedge(*CI, MBBs))
+        return true;
+    }
+  }
+  return false;
+}
+
+static bool containsNewBackedge(RegionMRT *Region) {
+  SmallPtrSet<MachineBasicBlock *, 8> MBBs;
+  return containsNewBackedge(Region, MBBs);
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
+  auto *LRegion = initLinearizedRegion(Region);
+  LRegion->setHasLoop(containsNewBackedge(Region));
+  MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region);
+  MachineBasicBlock *CurrentMerge = LastMerge;
+  LRegion->addMBB(LastMerge);
+  LRegion->setExit(LastMerge);
+
+  rewriteRegionExitPHIs(Region, LastMerge, LRegion);
+  removeOldExitPreds(Region);
+
+  DEBUG(PHIInfo.dump(MRI));
+
+  SetVector<MRT *> *Children = Region->getChildren();
+  DEBUG(dbgs() << "===========If Region Start===============\n");
+  if (LRegion->getHasLoop()) {
+    DEBUG(dbgs() << "Has Backedge: Yes\n");
+  } else {
+    DEBUG(dbgs() << "Has Backedge: No\n");
+  }
+
+  unsigned BBSelectRegIn;
+  unsigned BBSelectRegOut;
+  for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
+    DEBUG(dbgs() << "CurrentRegion: \n");
+    DEBUG(LRegion->print(dbgs(), TRI));
+
+    auto CNI = CI;
+    ++CNI;
+
+    MRT *Child = (*CI);
+
+    if (Child->isRegion()) {
+
+      LinearizedRegion *InnerLRegion =
+          Child->getRegionMRT()->getLinearizedRegion();
+      // We found that this block is the exit of an inner region; we need
+      // to put it in the current linearized region.
+
+      DEBUG(dbgs() << "Linearizing region: ");
+      DEBUG(InnerLRegion->print(dbgs(), TRI));
+      DEBUG(dbgs() << "\n");
+
+      MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
+      if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
+        // Entry has already been linearized, no need to do this region.
+        unsigned OuterSelect = InnerLRegion->getBBSelectRegOut();
+        unsigned InnerSelectReg =
+            InnerLRegion->getRegionMRT()->getInnerOutputRegister();
+        replaceRegisterWith(InnerSelectReg, OuterSelect);
+        resolvePHIInfos(InnerEntry);
+        if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge))
+          InnerLRegion->getExit()->addSuccessor(CurrentMerge);
+        continue;
+      }
+
+      BBSelectRegOut = Child->getBBSelectRegOut();
+      BBSelectRegIn = Child->getBBSelectRegIn();
+
+      DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+                   << "\n");
+      DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+                   << "\n");
+
+      MachineBasicBlock *IfEnd = CurrentMerge;
+      CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
+                                    Child->getRegionMRT()->getEntry(),
+                                    BBSelectRegIn, BBSelectRegOut);
+      TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+    } else {
+      MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
+      DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+
+      if (MBB == getSingleExitNode(*(MBB->getParent()))) {
+        // If this is the exit block then we need to skip to the next.
+        // The "in" register will be transferred to "out" in the next
+        // iteration.
+ continue; + } + + BBSelectRegOut = Child->getBBSelectRegOut(); + BBSelectRegIn = Child->getBBSelectRegIn(); + + DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + << "\n"); + DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + << "\n"); + + MachineBasicBlock *IfEnd = CurrentMerge; + // This is a basic block that is not part of an inner region, we + // need to put it in the current linearized region. + CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn, + BBSelectRegOut); + if (CurrentMerge) { + TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); + } + + DEBUG(PHIInfo.dump(MRI)); + } + } + + LRegion->removeFalseRegisterKills(MRI); + + if (LRegion->getHasLoop()) { + MachineBasicBlock *NewSucc = splitEntry(LRegion); + if (isFunctionEntryBlock(LRegion->getEntry())) { + resolvePHIInfos(LRegion->getEntry()); + } + const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); + unsigned InReg = LRegion->getBBSelectRegIn(); + unsigned InnerSelectReg = + MRI->createVirtualRegister(MRI->getRegClass(InReg)); + unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); + TII->materializeImmediate(*(LRegion->getEntry()), + LRegion->getEntry()->getFirstTerminator(), DL, + NewInReg, Region->getEntry()->getNumber()); + // Need to be careful about updating the registers inside the region. + LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI); + DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); + insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc, + InnerSelectReg, NewInReg, + LRegion->getRegionMRT()->getInnerOutputRegister()); + splitExit(LRegion); + TII->convertNonUniformLoopRegion(NewSucc, LastMerge); + } + + if (Region->isRoot()) { + TII->insertReturn(*LastMerge); + } + + DEBUG(Region->getEntry()->getParent()->dump()); + DEBUG(LRegion->print(dbgs(), TRI)); + DEBUG(PHIInfo.dump(MRI)); + + DEBUG(dbgs() << "===========If Region End===============\n"); + + Region->setLinearizedRegion(LRegion); + return true; +} + +bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) { + if (false && regionIsSimpleIf(Region)) { + transformSimpleIfRegion(Region); + return true; + } else if (regionIsSequence(Region)) { + fixupRegionExits(Region); + return false; + } else { + structurizeComplexRegion(Region); + } + return false; +} + +static int structurize_once = 0; + +bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region, + bool isTopRegion) { + bool Changed = false; + + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (CI->isRegion()) { + Changed |= structurizeRegions(CI->getRegionMRT(), false); + } + } + + if (structurize_once < 2 || true) { + Changed |= structurizeRegion(Region); + structurize_once++; + } + return Changed; +} + +void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) { + DEBUG(dbgs() << "Fallthrough Map:\n"); + for (auto &MBBI : MF) { + MachineBasicBlock *MBB = MBBI.getFallThrough(); + if (MBB != nullptr) { + DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " + << MBB->getNumber() << "\n"); + } + FallthroughMap[&MBBI] = MBB; + } +} + +void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, + unsigned SelectOut) { + LinearizedRegion *LRegion = new LinearizedRegion(); + if (SelectOut) { + LRegion->addLiveOut(SelectOut); + DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI) + << "\n"); + } + LRegion->setRegionMRT(Region); + Region->setLinearizedRegion(LRegion); + 
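+  // Mirror the MRT parent/child structure on the linearized regions so a
+  // nested region can reach its enclosing region.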
LRegion->setParent(Region->getParent() + ? Region->getParent()->getLinearizedRegion() + : nullptr); +} + +unsigned +AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII) { + if (MRT->isRegion()) { + RegionMRT *Region = MRT->getRegionMRT(); + Region->setBBSelectRegOut(SelectOut); + unsigned InnerSelectOut = createBBSelectReg(TII, MRI); + + // Fixme: Move linearization creation to the original spot + createLinearizedRegion(Region, SelectOut); + + for (auto CI = Region->getChildren()->begin(), + CE = Region->getChildren()->end(); + CI != CE; ++CI) { + InnerSelectOut = + initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); + } + MRT->setBBSelectRegIn(InnerSelectOut); + return InnerSelectOut; + } else { + MRT->setBBSelectRegOut(SelectOut); + unsigned NewSelectIn = createBBSelectReg(TII, MRI); + MRT->setBBSelectRegIn(NewSelectIn); + return NewSelectIn; + } +} + +static void checkRegOnlyPHIInputs(MachineFunction &MF) { + for (auto &MBBI : MF) { + for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(), + E = MBBI.instr_end(); + I != E; ++I) { + MachineInstr &Instr = *I; + if (Instr.isPHI()) { + int numPreds = getPHINumInputs(Instr); + for (int i = 0; i < numPreds; ++i) { + assert(Instr.getOperand(i * 2 + 1).isReg() && + "PHI Operand not a register"); + } + } + } + } +} + + +INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass) +INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) + +char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID; + + +bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &(MF.getRegInfo()); + initFallthroughMap(MF); + + checkRegOnlyPHIInputs(MF); + DEBUG(dbgs() << "----STRUCTURIZER START----\n"); + DEBUG(MF.dump()); + + Regions = &(getAnalysis().getRegionInfo()); + DEBUG(Regions->dump()); + + RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI); + setRegionMRT(RTree); + initializeSelectRegisters(RTree, 0, MRI, TII); + DEBUG(RTree->dump(TRI)); + bool result = structurizeRegions(RTree, true); + delete RTree; + DEBUG(dbgs() << "----STRUCTURIZER END----\n"); + initFallthroughMap(MF); + return result; +} + +FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() { + return new AMDGPUMachineCFGStructurizer(); +} diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 36dcc699d4ea..e40f39557747 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -397,14 +397,17 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // instructions. static bool canVectorizeInst(Instruction *Inst, User *User) { switch (Inst->getOpcode()) { - case Instruction::Load: + case Instruction::Load: { + LoadInst *LI = cast(Inst); + return !LI->isVolatile(); + } case Instruction::BitCast: case Instruction::AddrSpaceCast: return true; case Instruction::Store: { // Must be the stored pointer operand, not a stored value. 
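+    // For example, "store i32 %v, i32* %alloca" qualifies (the alloca is
+    // the address), while "store i32* %alloca, i32** %p" stores the
+    // pointer itself and escapes it.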
StoreInst *SI = cast(Inst); - return SI->getPointerOperand() == User; + return (SI->getPointerOperand() == User) && !SI->isVolatile(); } default: return false; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 972c28579f7a..6e301b4ad527 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -125,6 +125,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWA(false), HasDPP(false), FlatAddressSpace(false), + FlatInstOffsets(false), + FlatGlobalInsts(false), + FlatScratchInsts(false), R600ALUInst(false), CaymanISA(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index a5cda817ac11..bed7d326b3dd 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -145,6 +145,9 @@ protected: bool HasSDWA; bool HasDPP; bool FlatAddressSpace; + bool FlatInstOffsets; + bool FlatGlobalInsts; + bool FlatScratchInsts; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -380,6 +383,18 @@ public: return FlatAddressSpace; } + bool hasFlatInstOffsets() const { + return FlatInstOffsets; + } + + bool hasFlatGlobalInsts() const { + return FlatGlobalInsts; + } + + bool hasFlatScratchInsts() const { + return FlatScratchInsts; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index cd5bad04d0b3..386a88b0520f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -118,6 +118,13 @@ static cl::opt EnableSIInsertWaitcntsPass( cl::desc("Use new waitcnt insertion pass"), cl::init(false)); +// Option to run late CFG structurizer +static cl::opt LateCFGStructurize( + "amdgpu-late-structurize", + cl::desc("Enable late CFG structurization"), + cl::init(false), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -702,11 +709,15 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
addPass(&AMDGPUUnifyDivergentExitNodesID); - addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + if (!LateCFGStructurize) { + addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + } addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); - addPass(createSIAnnotateControlFlowPass()); + if (!LateCFGStructurize) { + addPass(createSIAnnotateControlFlowPass()); + } return false; } @@ -770,6 +781,9 @@ bool GCNPassConfig::addGlobalInstructionSelect() { #endif void GCNPassConfig::addPreRegAlloc() { + if (LateCFGStructurize) { + addPass(createAMDGPUMachineCFGStructurizerPass()); + } addPass(createSIWholeQuadModePass()); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index c9482c37ec80..beafebc1284a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -363,13 +363,22 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: - case Instruction::InsertElement: + case Instruction::InsertElement: { + unsigned EltSize + = DL.getTypeSizeInBits(cast(ValTy)->getElementType()); + if (EltSize < 32) { + if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) + return 0; + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } + // Extracts are just reads of a subregister, so are free. Inserts are // considered free because we don't want to have any cost for scalarizing // operations, and we don't have to copy into a different register class. // Dynamic indexing isn't free and is best avoided. return Index == ~0u ? 2 : 0; + } default: return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } @@ -479,3 +488,26 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return false; } + +unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + if (ST->hasVOP3PInsts()) { + VectorType *VT = cast(Tp); + if (VT->getNumElements() == 2 && + DL.getTypeSizeInBits(VT->getElementType()) == 16) { + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle is free. 
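+      // For example, reversing a <2 x i16> value costs nothing: the use
+      // site simply reads the opposite half via op_sel.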
+ + switch (Kind) { + case TTI::SK_Broadcast: + case TTI::SK_Reverse: + case TTI::SK_PermuteSingleSrc: + return 0; + default: + break; + } + } + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 71d6306bc1a5..e0024e21e82b 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -114,6 +114,9 @@ public: } unsigned getVectorSplitCost() { return 0; } + + unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp); }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 7c0ef4aeac3c..cafce0164fa9 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -48,6 +48,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUISelDAGToDAG.cpp AMDGPULowerIntrinsics.cpp AMDGPUMCInstLower.cpp + AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUUnifyMetadata.cpp AMDGPUOpenCLImageTypeLoweringPass.cpp diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index b0ac0e689a0b..8ba9efd42c70 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -def FLATAtomic : ComplexPattern; +def FLATAtomic : ComplexPattern; //===----------------------------------------------------------------------===// // FLAT classes @@ -62,7 +62,9 @@ class FLAT_Real op, FLAT_Pseudo ps> : bits<8> vdst; bits<1> slc; bits<1> glc; - bits<1> tfe; + + // We don't use tfe right now, and it was removed in gfx9. + bits<1> tfe = 0; // 15-0 is reserved. let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); @@ -79,8 +81,8 @@ class FLAT_Real op, FLAT_Pseudo ps> : class FLAT_Load_Pseudo : FLAT_Pseudo< opName, (outs regClass:$vdst), - (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe), - " $vdst, $vaddr$glc$slc$tfe"> { + (ins VReg_64:$vaddr, GLC:$glc, slc:$slc), + " $vdst, $vaddr$glc$slc"> { let has_data = 0; let mayLoad = 1; } @@ -88,8 +90,8 @@ class FLAT_Load_Pseudo : FLAT_Pseudo< class FLAT_Store_Pseudo : FLAT_Pseudo< opName, (outs), - (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe), - " $vaddr, $vdata$glc$slc$tfe"> { + (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc), + " $vaddr, $vdata$glc$slc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -105,8 +107,8 @@ multiclass FLAT_Atomic_Pseudo< def "" : FLAT_Pseudo , AtomicNoRet { let mayLoad = 1; @@ -119,10 +121,10 @@ multiclass FLAT_Atomic_Pseudo< def _RTN : FLAT_Pseudo , + (atomic (FLATAtomic i64:$vaddr, i1:$slc), data_vt:$vdata))]>, AtomicNoRet { let mayLoad = 1; let mayStore = 1; @@ -311,30 +313,30 @@ def flat_truncstorei16 : flat_st ; // Patterns for global loads with no offset. class FlatLoadPat : Pat < (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) + (inst $addr, 0, 0) >; class FlatLoadAtomicPat : Pat < (vt (node i64:$addr)), - (inst $addr, 1, 0, 0) + (inst $addr, 1, 0) >; class FlatStorePat : Pat < (node vt:$data, i64:$addr), - (inst $addr, $data, 0, 0, 0) + (inst $addr, $data, 0, 0) >; class FlatStoreAtomicPat : Pat < // atomic store follows atomic binop convention so the address comes // first. 
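+  // Compare FlatStorePat above, where the DAG operands are (data, addr).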
(node i64:$addr, vt:$data), - (inst $addr, $data, 1, 0, 0) + (inst $addr, $data, 1, 0) >; class FlatAtomicPat : Pat < (vt (node i64:$addr, data_vt:$data)), - (inst $addr, $data, 0, 0) + (inst $addr, $data, 0) >; let Predicates = [isCIVI] in { diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index bf16a8216001..8066428fe44a 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI, unsigned Num = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) + if (!LIS.hasInterval(Reg)) continue; const auto &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { @@ -131,13 +131,13 @@ bool GCNRegPressure::less(const SISubtarget &ST, const GCNRegPressure& O, unsigned MaxOccupancy) const { const auto SGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumSGPRs(getSGRPNum())); + ST.getOccupancyWithNumSGPRs(getSGPRNum())); const auto VGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(getVGRPNum())); + ST.getOccupancyWithNumVGPRs(getVGPRNum())); const auto OtherSGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumSGPRs(O.getSGRPNum())); + ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); const auto OtherVGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(O.getVGRPNum())); + ST.getOccupancyWithNumVGPRs(O.getVGPRNum())); const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); @@ -167,17 +167,17 @@ bool GCNRegPressure::less(const SISubtarget &ST, return VW < OtherVW; } } - return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()): - (getVGRPNum() < O.getVGRPNum()); + return SGPRImportant ? 
(getSGPRNum() < O.getSGPRNum()): + (getVGPRNum() < O.getVGPRNum()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { - OS << "VGPRs: " << getVGRPNum(); - if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')'; - OS << ", SGPRs: " << getSGRPNum(); - if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')'; + OS << "VGPRs: " << getVGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; + OS << ", SGPRs: " << getSGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; OS << ", LVGPR WT: " << getVGPRTuplesWeight() << ", LSGPR WT: " << getSGPRTuplesWeight(); if (ST) OS << " -> Occ: " << getOccupancy(*ST); @@ -192,7 +192,6 @@ LaneBitmask llvm::getLiveLaneMask(unsigned Reg, SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { - assert(!MRI.reg_nodbg_empty(Reg)); LaneBitmask LiveMask; const auto &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { @@ -214,7 +213,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { auto Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) + if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); if (LiveMask.any()) @@ -223,13 +222,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, return LiveRegs; } -void GCNUpwardRPTracker::reset(const MachineInstr &MI) { - MRI = &MI.getParent()->getParent()->getRegInfo(); - LiveRegs = getLiveRegsAfter(MI, LIS); - MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); -} - -LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { +LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const { assert(MO.isDef() && MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())); @@ -241,7 +234,7 @@ LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg()); } -LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { +LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const { assert(MO.isUse() && MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())); @@ -259,6 +252,18 @@ LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI); } +void GCNUpwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = getLiveRegsAfter(MI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); +} + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); @@ -297,6 +302,100 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { MaxPressure = max(MaxPressure, CurPressure); } +bool GCNDownwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + LastTrackedMI = nullptr; + MBBEnd = MI.getParent()->end(); + NextMI = &MI; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = 
getLiveRegsBefore(*NextMI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); + return true; +} + +bool GCNDownwardRPTracker::advanceBeforeNext() { + assert(MRI && "call reset first"); + + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + + SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex(); + assert(SI.isValid()); + + // Remove dead registers or mask bits. + for (auto &It : LiveRegs) { + const LiveInterval &LI = LIS.getInterval(It.first); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!S.liveAt(SI)) { + auto PrevMask = It.second; + It.second &= ~S.LaneMask; + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + } + } else if (!LI.liveAt(SI)) { + auto PrevMask = It.second; + It.second = LaneBitmask::getNone(); + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + if (It.second.none()) + LiveRegs.erase(It.first); + } + + MaxPressure = max(MaxPressure, CurPressure); + + return true; +} + +void GCNDownwardRPTracker::advanceToNext() { + LastTrackedMI = &*NextMI++; + + // Add new registers or mask bits. + for (const auto &MO : LastTrackedMI->defs()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask |= getDefRegMask(MO); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + MaxPressure = max(MaxPressure, CurPressure); +} + +bool GCNDownwardRPTracker::advance() { + // If we have just called reset live set is actual. + if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext())) + return false; + advanceToNext(); + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) { + while (NextMI != End) + if (!advance()) return false; + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy) { + reset(*Begin, LiveRegsCopy); + return advance(End); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, @@ -352,4 +451,16 @@ bool GCNUpwardRPTracker::isValid() const { return true; } +void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + auto It = LiveRegs.find(Reg); + if (It != LiveRegs.end() && It->second.any()) + OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':' + << PrintLaneMask(It->second); + } + OS << '\n'; +} #endif diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index 82e76a7bfddc..9875ca6a6d16 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -33,19 +33,19 @@ struct GCNRegPressure { clear(); } - bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; } + bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; } void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } - unsigned getSGRPNum() const { return Value[SGPR32]; } - unsigned getVGRPNum() const { return Value[VGPR32]; } + unsigned getSGPRNum() const { return Value[SGPR32]; } + unsigned getVGPRNum() const { return Value[VGPR32]; } unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } 
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } unsigned getOccupancy(const SISubtarget &ST) const { - return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()), - ST.getOccupancyWithNumVGPRs(getVGRPNum())); + return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs(getVGPRNum())); } void inc(unsigned Reg, @@ -92,16 +92,21 @@ public: typedef DenseMap LiveRegSet; protected: + const LiveIntervals &LIS; LiveRegSet LiveRegs; GCNRegPressure CurPressure, MaxPressure; const MachineInstr *LastTrackedMI = nullptr; mutable const MachineRegisterInfo *MRI = nullptr; - GCNRPTracker() {} + GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + LaneBitmask getDefRegMask(const MachineOperand &MO) const; + LaneBitmask getUsedRegMask(const MachineOperand &MO) const; public: // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } + void clearMaxPressure() { MaxPressure.clear(); } + // returns MaxPressure, resetting it decltype(MaxPressure) moveMaxPressure() { auto Res = MaxPressure; @@ -111,17 +116,16 @@ public: decltype(LiveRegs) moveLiveRegs() { return std::move(LiveRegs); } + static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI); }; class GCNUpwardRPTracker : public GCNRPTracker { - const LiveIntervals &LIS; - LaneBitmask getDefRegMask(const MachineOperand &MO) const; - LaneBitmask getUsedRegMask(const MachineOperand &MO) const; public: - GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} // reset tracker to the point just below MI // filling live regs upon this point using LIS - void reset(const MachineInstr &MI); + void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); // move to the state just above the MI void recede(const MachineInstr &MI); @@ -131,6 +135,41 @@ public: bool isValid() const; }; +class GCNDownwardRPTracker : public GCNRPTracker { + // Last position of reset or advanceBeforeNext + MachineBasicBlock::const_iterator NextMI; + + MachineBasicBlock::const_iterator MBBEnd; + +public: + GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + + const MachineBasicBlock::const_iterator getNext() const { return NextMI; } + + // Reset tracker to the point before the MI + // filling live regs upon this point using LIS. + // Returns false if block is empty except debug values. + bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); + + // Move to the state right before the next MI. Returns false if reached + // end of the block. + bool advanceBeforeNext(); + + // Move to the state at the MI, advanceBeforeNext has to be called first. + void advanceToNext(); + + // Move to the state at the next MI. Returns false if reached end of block. + bool advance(); + + // Advance instructions until before End. + bool advance(MachineBasicBlock::const_iterator End); + + // Reset to Begin and advance to End. 
+ bool advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy = nullptr); +}; + LaneBitmask getLiveLaneMask(unsigned Reg, SlotIndex SI, const LiveIntervals &LIS, diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 630442625aa3..8ec46665daf5 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -316,46 +316,57 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, MFI(*MF.getInfo()), StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), *MF.getFunction())), - MinOccupancy(StartingOccupancy), Stage(0) { + MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { + if (Stage == 0) { + // Just record regions at the first pass. + Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + return; + } + std::vector Unsched; Unsched.reserve(NumRegionInstrs); for (auto &I : *this) Unsched.push_back(&I); - std::pair PressureBefore; + GCNRegPressure PressureBefore; if (LIS) { - DEBUG(dbgs() << "Pressure before scheduling:\n"); - discoverLiveIns(); - PressureBefore = getRealRegPressure(); + PressureBefore = Pressure[RegionIdx]; + + DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; + PressureBefore.print(dbgs())); } ScheduleDAGMILive::schedule(); - if (Stage == 0) - Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); if (!LIS) return; // Check the results of scheduling. 
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - DEBUG(dbgs() << "Pressure after scheduling:\n"); auto PressureAfter = getRealRegPressure(); - LiveIns.clear(); - if (PressureAfter.first <= S.SGPRCriticalLimit && - PressureAfter.second <= S.VGPRCriticalLimit) { + DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs())); + + if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && + PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { + Pressure[RegionIdx] = PressureAfter; DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } - unsigned WavesAfter = getMaxWaves(PressureAfter.first, - PressureAfter.second, MF); - unsigned WavesBefore = getMaxWaves(PressureBefore.first, - PressureBefore.second, MF); + unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(), + PressureAfter.getVGPRNum(), MF); + unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(), + PressureBefore.getVGPRNum(), MF); DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); @@ -368,8 +379,10 @@ void GCNScheduleDAGMILive::schedule() { << MinOccupancy << ".\n"); } - if (WavesAfter >= WavesBefore) + if (WavesAfter >= WavesBefore) { + Pressure[RegionIdx] = PressureAfter; return; + } DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RegionEnd = RegionBegin; @@ -398,166 +411,139 @@ void GCNScheduleDAGMILive::schedule() { DEBUG(dbgs() << "Scheduling " << *MI); } RegionBegin = Unsched.front()->getIterator(); - if (Stage == 0) - Regions.back() = std::make_pair(RegionBegin, RegionEnd); + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); placeDebugValues(); } -static inline void setMask(const MachineRegisterInfo &MRI, - const SIRegisterInfo *SRI, unsigned Reg, - LaneBitmask &PrevMask, LaneBitmask NewMask, - unsigned &SGPRs, unsigned &VGPRs) { - int NewRegs = countPopulation(NewMask.getAsInteger()) - - countPopulation(PrevMask.getAsInteger()); - if (SRI->isSGPRReg(MRI, Reg)) - SGPRs += NewRegs; - if (SRI->isVGPR(MRI, Reg)) - VGPRs += NewRegs; - assert ((int)SGPRs >= 0 && (int)VGPRs >= 0); - PrevMask = NewMask; +GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const { + GCNDownwardRPTracker RPTracker(*LIS); + RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + return RPTracker.moveMaxPressure(); } -void GCNScheduleDAGMILive::discoverLiveIns() { - unsigned SGPRs = 0; - unsigned VGPRs = 0; +void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { + GCNDownwardRPTracker RPTracker(*LIS); + + // If the block has the only successor then live-ins of that successor are + // live-outs of the current block. We can reuse calculated live set if the + // successor will be sent to scheduling past current block. 
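+  // The SlotIndex comparison below ensures the successor is laid out (and
+  // thus scheduled) after this block, so the cached live set is still
+  // current when the scheduler reaches it.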
+ const MachineBasicBlock *OnlySucc = nullptr; + if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) { + SlotIndexes *Ind = LIS->getSlotIndexes(); + if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin())) + OnlySucc = *MBB->succ_begin(); + } - auto &MI = *begin()->getParent()->getFirstNonDebugInstr(); - const SIRegisterInfo *SRI = static_cast(TRI); - SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); - assert (SI.isValid()); - - DEBUG(dbgs() << "Region live-ins:"); - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) - continue; - const LiveInterval &LI = LIS->getInterval(Reg); - LaneBitmask LaneMask = LaneBitmask::getNone(); - if (LI.hasSubRanges()) { - for (const auto &S : LI.subranges()) - if (S.liveAt(SI)) - LaneMask |= S.LaneMask; - } else if (LI.liveAt(SI)) { - LaneMask = MRI.getMaxLaneMaskForVReg(Reg); - } + // Scheduler sends regions from the end of the block upwards. + size_t CurRegion = RegionIdx; + for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) + if (Regions[CurRegion].first->getParent() != MBB) + break; + --CurRegion; + + auto I = MBB->begin(); + auto LiveInIt = MBBLiveIns.find(MBB); + if (LiveInIt != MBBLiveIns.end()) { + auto LiveIn = std::move(LiveInIt->second); + RPTracker.reset(*MBB->begin(), &LiveIn); + MBBLiveIns.erase(LiveInIt); + } else { + I = Regions[CurRegion].first; + RPTracker.reset(*I); + } - if (LaneMask.any()) { - setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs); + for ( ; ; ) { + I = RPTracker.getNext(); - DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':' - << PrintLaneMask(LiveIns[Reg])); + if (Regions[CurRegion].first == I) { + LiveIns[CurRegion] = RPTracker.getLiveRegs(); + RPTracker.clearMaxPressure(); } - } - LiveInPressure = std::make_pair(SGPRs, VGPRs); + if (Regions[CurRegion].second == I) { + Pressure[CurRegion] = RPTracker.moveMaxPressure(); + if (CurRegion-- == RegionIdx) + break; + } + RPTracker.advanceToNext(); + RPTracker.advanceBeforeNext(); + } - DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs - << "\nVGPR = " << VGPRs << '\n'); + if (OnlySucc) { + if (I != MBB->end()) { + RPTracker.advanceToNext(); + RPTracker.advance(MBB->end()); + } + RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs()); + RPTracker.advanceBeforeNext(); + MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); + } } -std::pair -GCNScheduleDAGMILive::getRealRegPressure() const { - unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs; - SGPRs = MaxSGPRs = LiveInPressure.first; - VGPRs = MaxVGPRs = LiveInPressure.second; - - const SIRegisterInfo *SRI = static_cast(TRI); - DenseMap LiveRegs(LiveIns); +void GCNScheduleDAGMILive::finalizeSchedule() { + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - for (const MachineInstr &MI : *this) { - if (MI.isDebugValue()) - continue; - SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); - assert (SI.isValid()); + LiveIns.resize(Regions.size()); + Pressure.resize(Regions.size()); - // Remove dead registers or mask bits. 
- for (auto &It : LiveRegs) { - if (It.second.none()) - continue; - const LiveInterval &LI = LIS->getInterval(It.first); - if (LI.hasSubRanges()) { - for (const auto &S : LI.subranges()) - if (!S.liveAt(SI)) - setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask, - SGPRs, VGPRs); - } else if (!LI.liveAt(SI)) { - setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(), - SGPRs, VGPRs); - } - } + do { + Stage++; + RegionIdx = 0; + MachineBasicBlock *MBB = nullptr; - // Add new registers or mask bits. - for (const auto &MO : MI.defs()) { - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - unsigned SubRegIdx = MO.getSubReg(); - LaneBitmask LaneMask = SubRegIdx != 0 - ? TRI->getSubRegIndexLaneMask(SubRegIdx) - : MRI.getMaxLaneMaskForVReg(Reg); - LaneBitmask &LM = LiveRegs[Reg]; - setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs); - } - MaxSGPRs = std::max(MaxSGPRs, SGPRs); - MaxVGPRs = std::max(MaxVGPRs, VGPRs); - } + if (Stage > 1) { + // Retry function scheduling if we found resulting occupancy and it is + // lower than used for first pass scheduling. This will give more freedom + // to schedule low register pressure blocks. + // Code is partially copied from MachineSchedulerBase::scheduleRegions(). - DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs - << "\nVGPR = " << MaxVGPRs << '\n'); + if (!LIS || StartingOccupancy <= MinOccupancy) + break; - return std::make_pair(MaxSGPRs, MaxVGPRs); -} + DEBUG(dbgs() + << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); -void GCNScheduleDAGMILive::finalizeSchedule() { - // Retry function scheduling if we found resulting occupancy and it is - // lower than used for first pass scheduling. This will give more freedom - // to schedule low register pressure blocks. - // Code is partially copied from MachineSchedulerBase::scheduleRegions(). + S.setTargetOccupancy(MinOccupancy); + } - if (!LIS || StartingOccupancy <= MinOccupancy) - return; + for (auto Region : Regions) { + RegionBegin = Region.first; + RegionEnd = Region.second; - DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); + if (RegionBegin->getParent() != MBB) { + if (MBB) finishBlock(); + MBB = RegionBegin->getParent(); + startBlock(MBB); + if (Stage == 1) + computeBlockPressure(MBB); + } - Stage++; - GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - S.setTargetOccupancy(MinOccupancy); + unsigned NumRegionInstrs = std::distance(begin(), end()); + enterRegion(MBB, begin(), end(), NumRegionInstrs); - MachineBasicBlock *MBB = nullptr; - for (auto Region : Regions) { - RegionBegin = Region.first; - RegionEnd = Region.second; + // Skip empty scheduling regions (0 or 1 schedulable instructions). 
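+      // A region with a single instruction has nothing to reorder, so
+      // scheduling it would be wasted work.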
+      if (begin() == end() || begin() == std::prev(end())) {
+        exitRegion();
+        continue;
+      }

-    if (RegionBegin->getParent() != MBB) {
-      if (MBB) finishBlock();
-      MBB = RegionBegin->getParent();
-      startBlock(MBB);
-    }
+      DEBUG(dbgs() << "********** MI Scheduling **********\n");
+      DEBUG(dbgs() << MF.getName()
+            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+            << "\n  From: " << *begin() << "    To: ";
+            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+            else dbgs() << "End";
+            dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

-    unsigned NumRegionInstrs = std::distance(begin(), end());
-    enterRegion(MBB, begin(), end(), NumRegionInstrs);
+      schedule();

-    // Skip empty scheduling regions (0 or 1 schedulable instructions).
-    if (begin() == end() || begin() == std::prev(end())) {
       exitRegion();
-      continue;
+      ++RegionIdx;
     }

-    DEBUG(dbgs() << "********** MI Scheduling **********\n");
-    DEBUG(dbgs() << MF.getName()
-          << ":BB#" << MBB->getNumber() << " " << MBB->getName()
-          << "\n  From: " << *begin() << "    To: ";
-          if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
-          else dbgs() << "End";
-          dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+    finishBlock();

-    schedule();
-
-    exitRegion();
-  }
-  finishBlock();
-  LiveIns.shrink_and_clear();
+  } while (Stage < 2);
 }
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 15af232704ff..3ed3cd5b3b1c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
 #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H

+#include "GCNRegPressure.h"
 #include "llvm/CodeGen/MachineScheduler.h"

 namespace llvm {
@@ -74,21 +75,28 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
   // Scheduling stage number.
   unsigned Stage;

+  // Current region index.
+  size_t RegionIdx;
+
   // Vector of regions recorded for later rescheduling
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;

-  // Region live-ins.
-  DenseMap<unsigned, LaneBitmask> LiveIns;
+  // Region live-in cache.
+  SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+  // Region pressure cache.
+  SmallVector<GCNRegPressure, 32> Pressure;
+
+  // Temporary basic block live-in cache.
+  DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;

-  // Number of live-ins to the current region, first SGPR then VGPR.
-  std::pair<unsigned, unsigned> LiveInPressure;
+  // Return current region pressure.
+  GCNRegPressure getRealRegPressure() const;

-  // Collect current region live-ins.
-  void discoverLiveIns();
+  // Compute and cache live-ins and pressure for all regions in block.
+  void computeBlockPressure(const MachineBasicBlock *MBB);

-  // Return current region pressure. First value is SGPR number, second is VGPR.
- std::pair getRealRegPressure() const; public: GCNScheduleDAGMILive(MachineSchedContext *C, diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index d8cb98fe1b19..8cb35c506135 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -126,7 +126,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + ReturnStruct = StructType::get(Boolean, Int64); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index cc93c27731ff..48a14e4dbea2 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FCANONICALIZE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -2003,6 +2004,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( break; } assert(Found); + (void)Found; // This should be before all vector instructions. BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) @@ -4604,6 +4606,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performExtractVectorEltCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + + SelectionDAG &DAG= DCI.DAG; + if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx); + return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + } + + return SDValue(); +} + + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -4891,6 +4911,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::EXTRACT_VECTOR_ELT: + return performExtractVectorEltCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index d177777ad5ee..046e677756d1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 92e452a3d6a0..065fd09eb356 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -496,6 +496,188 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const { return Opcode; } +void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + int64_t Value) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); + if (RegClass == &AMDGPU::SReg_32RegClass || + RegClass == &AMDGPU::SGPR_32RegClass || + RegClass == &AMDGPU::SReg_32_XM0RegClass || + RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::SReg_64RegClass || + RegClass == &AMDGPU::SGPR_64RegClass || + RegClass == &AMDGPU::SReg_64_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::VGPR_32RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addImm(Value); + return; + } + if (RegClass == &AMDGPU::VReg_64RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) + .addImm(Value); + return; + } + + unsigned EltSize = 4; + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (RI.isSGPRClass(RegClass)) { + if (RI.getRegSizeInBits(*RegClass) > 32) { + Opcode = AMDGPU::S_MOV_B64; + EltSize = 8; + } else { + Opcode = AMDGPU::S_MOV_B32; + EltSize = 4; + } + } + + ArrayRef SubIndices = RI.getRegSplitParts(RegClass, EltSize); + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + int64_t IdxValue = Idx == 0 ? Value : 0; + + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, Idx)); + Builder.addImm(IdxValue); + } +} + +const TargetRegisterClass * +SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, + ArrayRef Cond, + unsigned TrueReg, + unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && + "Not a VGPR32 reg"); + + if (Cond.size() == 1) { + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(Cond[0]); + } else if (Cond.size() == 2) { + assert(Cond[0].isImm() && "Cond[0] is not an immediate"); + switch (Cond[0].getImm()) { + case SIInstrInfo::SCC_TRUE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::SCC_FALSE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::VCCNZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(RegOp); + break; + } + case SIInstrInfo::VCCZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(TrueReg) + .addReg(FalseReg) + .add(RegOp); + break; + } + case SIInstrInfo::EXECNZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = 
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::EXECZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + llvm_unreachable("Unhandled branch predicate EXECZ"); + break; + } + default: + llvm_unreachable("invalid branch predicate"); + } + } else { + llvm_unreachable("Can only handle Cond size 1 or 2"); + } +} + +unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + +unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.getRegSizeInBits(*DstRC) == 32) { @@ -834,6 +1016,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, insertWaitStates(MBB, MI, 1); } +void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { + auto MF = MBB.getParent(); + SIMachineFunctionInfo *Info = MF->getInfo(); + + assert(Info->isEntryFunction()); + + if (MBB.succ_empty()) { + bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); + if (HasNoTerminator) + BuildMI(MBB, MBB.end(), DebugLoc(), + get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG)); + } +} + unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? @@ -1241,14 +1437,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, return false; } - BranchPredicate Pred = getBranchPredicate(I->getOpcode()); - if (Pred == INVALID_BR) - return true; + MachineBasicBlock *CondBB = nullptr; - MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(Pred)); - Cond.push_back(I->getOperand(1)); // Save the branch register. + if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + CondBB = I->getOperand(1).getMBB(); + Cond.push_back(I->getOperand(0)); + } else { + BranchPredicate Pred = getBranchPredicate(I->getOpcode()); + if (Pred == INVALID_BR) + return true; + CondBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Pred)); + Cond.push_back(I->getOperand(1)); // Save the branch register. 
+ } ++I; if (I == MBB.end()) { @@ -1351,6 +1553,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, return 1; } + if(Cond.size() == 1 && Cond[0].isReg()) { + BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) + .add(Cond[0]) + .addMBB(TBB); + return 1; + } + assert(TBB && Cond[0].isImm()); unsigned Opcode @@ -1390,9 +1599,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, bool SIInstrInfo::reverseBranchCondition( SmallVectorImpl &Cond) const { - assert(Cond.size() == 2); - Cond[0].setImm(-Cond[0].getImm()); - return false; + if (Cond.size() != 2) { + return true; + } + + if (Cond[0].isImm()) { + Cond[0].setImm(-Cond[0].getImm()); + return false; + } + + return true; } bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, @@ -3920,6 +4136,82 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return false; } +bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { + return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; +} + +void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const { + MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); + assert(TI != IfEntry->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = IfEntry->getParent(); + MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstr *SIIF = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) + .add(Branch->getOperand(0)) + .add(Branch->getOperand(1)); + MachineInstr *SIEND = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) + .addReg(DstReg); + + IfEntry->erase(TI); + IfEntry->insert(IfEntry->end(), SIIF); + IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); + } +} + +void SIInstrInfo::convertNonUniformLoopRegion( + MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { + MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); + // We expect 2 terminators, one conditional and one unconditional. 
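+  // getFirstTerminator() therefore returns the conditional backedge branch,
+  // which is rewritten into SI_IF_BREAK + SI_LOOP below.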
+ assert(TI != LoopEnd->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = LoopEnd->getParent(); + MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder HeaderPHIBuilder = + BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); + for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), + E = LoopEntry->pred_end(); + PI != E; ++PI) { + if (*PI == LoopEnd) { + HeaderPHIBuilder.addReg(BackEdgeReg); + } else { + MachineBasicBlock *PMBB = *PI; + unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), + ZeroReg, 0); + HeaderPHIBuilder.addReg(ZeroReg); + } + HeaderPHIBuilder.addMBB(*PI); + } + MachineInstr *HeaderPhi = HeaderPHIBuilder; + MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), + get(AMDGPU::SI_IF_BREAK), BackEdgeReg) + .addReg(DstReg) + .add(Branch->getOperand(0)); + MachineInstr *SILOOP = + BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) + .addReg(BackEdgeReg) + .addMBB(LoopEntry); + + LoopEntry->insert(LoopEntry->begin(), HeaderPhi); + LoopEnd->erase(TI); + LoopEnd->insert(LoopEnd->end(), SIIFBREAK); + LoopEnd->insert(LoopEnd->end(), SILOOP); + } +} + ArrayRef> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair TargetIndices[] = { diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 03a5ef74b179..f6e5e8883f63 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -143,6 +143,23 @@ public: RegScavenger *RS, unsigned TmpReg, unsigned Offset, unsigned Size) const; + void materializeImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, + unsigned DestReg, + int64_t Value) const; + + const TargetRegisterClass *getPreferredSelectRegClass( + unsigned Size) const; + + unsigned insertNE(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + + unsigned insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -193,7 +210,7 @@ public: bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, - bool AllowModify) const override; + bool AllowModify = false) const override; unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved = nullptr) const override; @@ -218,6 +235,11 @@ public: unsigned DstReg, ArrayRef Cond, unsigned TrueReg, unsigned FalseReg) const override; + void insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef Cond, + unsigned TrueReg, unsigned FalseReg) const; + bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; @@ -705,6 +727,7 @@ public: void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + void insertReturn(MachineBasicBlock &MBB) const; /// \brief Return the number of wait states that result from executing this /// instruction. 
unsigned getNumWaitStates(const MachineInstr &MI) const; @@ -750,6 +773,14 @@ public: bool mayAccessFlatAddressSpace(const MachineInstr &MI) const; + bool isNonUniformBranchInstr(MachineInstr &Instr) const; + + void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const; + + void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, + MachineBasicBlock *LoopEnd) const; + ArrayRef> getSerializableTargetIndices() const override; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 7ccb54f54e34..3b4bdc864253 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -174,6 +174,13 @@ def SI_MASK_BRANCH : VPseudoInstSI < let isTerminator = 1 in { + def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < + (outs), + (ins SReg_64:$vcc, brtarget:$target), + [(brcond i1:$vcc, bb:$target)]> { + let Size = 12; +} + def SI_IF: CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 2281f338ab45..4a11d9471f1d 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -164,8 +164,11 @@ multiclass VOP2eInst : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); - field string Asm32 = "$vdst, $src0, $src1, $imm"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand; + field string Asm32 = " $vdst, $src0, $src1, $imm"; } def VOP_MADAK_F16 : VOP_MADAK ; @@ -174,8 +177,11 @@ def VOP_MADAK_F32 : VOP_MADAK ; class VOP_MADMK : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); - field string Asm32 = "$vdst, $src0, $imm, $src1"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand; + field string Asm32 = " $vdst, $src0, $imm, $src1"; } def VOP_MADMK_F16 : VOP_MADMK ; @@ -298,7 +304,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let SubtargetPredicate = isGCN in { defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; -def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>; +def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; @@ -328,7 +334,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2", defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } -def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>; +def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
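The `Asm32` changes above and the `VOP2_Real_e64only_vi` multiclass in the next hunk rely on the same trick: the printed form of an instruction is assembled by concatenating a mnemonic with an operand string (visible below as `let AsmString = ps.Mnemonic # " " # ps.AsmOperands;`), so moving the leading space into the operand string and reusing the pseudo's bare mnemonic keeps an `_e64` suffix from appearing in the output. A minimal C++ sketch of that concatenation, with illustrative names rather than LLVM's actual printer API:

#include <iostream>
#include <string>

// Illustrative only: models how an instruction's printed form is derived by
// gluing a mnemonic onto an operand string that carries its own separator.
std::string buildAsmString(const std::string &Mnemonic,
                           const std::string &AsmOperands) {
  // No separator is added here; AsmOperands starts with " " by convention.
  return Mnemonic + AsmOperands;
}

int main() {
  // With the leading space inside the operand string, the base mnemonic
  // prints without any "_e64" decoration forced in between.
  std::cout << buildAsmString("v_madak_f32", " $vdst, $src0, $src1, $imm")
            << '\n';
  return 0;
}
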
@@ -383,7 +389,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; let SubtargetPredicate = isVI in { -def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>; +def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; @@ -394,7 +400,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; -def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>; +def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; @@ -651,6 +657,17 @@ multiclass VOP2_Real_e64_vi op> { VOP3e_vi (NAME#"_e64").Pfl>; } +multiclass VOP2_Real_e64only_vi op> { + def _e64_vi : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3e_vi (NAME#"_e64").Pfl> { + // Hack to stop printing _e64 + VOP3_Pseudo ps = !cast(NAME#"_e64"); + let OutOperandList = (outs VGPR_32:$vdst); + let AsmString = ps.Mnemonic # " " # ps.AsmOperands; + } +} + multiclass Base_VOP2be_Real_e32e64_vi op> : VOP2_Real_e32_vi { def _e64_vi : VOP3_Real(NAME#"_e64"), SIEncodingFamily.VI>, @@ -718,17 +735,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>; defm V_READLANE_B32 : VOP32_Real_vi <0x289>; defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>; -defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>; -defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>; -defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>; -defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>; -defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>; -defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>; -defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>; -defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>; -defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>; -defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>; +defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; +defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>; +defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>; +defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>; +defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>; +defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>; defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>; defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 217a07488853..ffa6c60d6b1f 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -232,7 +232,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile>; let SubtargetPredicate = isCIVI in { -def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile>; def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile, int_amdgcn_qsad_pk_u16_u8>; def 
V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile, int_amdgcn_mqsad_u32_u8>; @@ -402,7 +401,6 @@ multiclass VOP3be_Real_ci op> { } } -defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>; defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>; defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>; @@ -426,7 +424,6 @@ multiclass VOP3be_Real_vi op> { } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" -defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>; defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 28c407f74125..dd7fe871345a 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -404,21 +404,11 @@ public: /// Returns predicate register associated with the given frame instruction. unsigned getFramePred(const MachineInstr &MI) const { assert(isFrameInstr(MI)); - if (isFrameSetup(MI)) - // Operands of ADJCALLSTACKDOWN: - // - argument declared in ADJCALLSTACKDOWN pattern: - // 0 - frame size - // 1 - predicate code (like ARMCC::AL) - // - added by predOps: - // 2 - predicate reg - return MI.getOperand(2).getReg(); - assert(MI.getOpcode() == ARM::ADJCALLSTACKUP || - MI.getOpcode() == ARM::tADJCALLSTACKUP); - // Operands of ADJCALLSTACKUP: - // - argument declared in ADJCALLSTACKUP pattern: + // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP: + // - argument declared in the pattern: // 0 - frame size - // 1 - arg of CALLSEQ_END - // 2 - predicate code + // 1 - arg of CALLSEQ_START/CALLSEQ_END + // 2 - predicate code (like ARMCC::AL) // - added by predOps: // 3 - predicate reg return MI.getOperand(3).getReg(); diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 9178c67afa6e..46ac4d0ad933 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -433,7 +433,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // We now know the size of the stack - update the ADJCALLSTACKDOWN // accordingly. - CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL)); + CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL)); MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP) .addImm(ArgHandler.StackSize) diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 56cac855620d..4f6a73b5980d 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1949,7 +1949,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes)); + .addImm(NumBytes).addImm(0)); // Process the args. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e64582402fe1..f8b584db7b99 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -473,9 +473,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. 
- if (Subtarget->isTargetWatchOS() || - (Subtarget->isTargetIOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { + if (Subtarget->isTargetMachO() && + !(Subtarget->isTargetIOS() && + Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -1817,8 +1817,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); @@ -7365,7 +7364,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; @@ -13115,7 +13114,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); + Type *RetTy = StructType::get(Ty, Ty); if (Subtarget->isTargetWindows()) InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); @@ -13417,9 +13416,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -13428,7 +13427,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, case AtomicOrdering::Acquire: return nullptr; // Nothing to do case AtomicOrdering::SequentiallyConsistent: - if (!IsStore) + if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do /*FALLTHROUGH*/ case AtomicOrdering::Release: @@ -13442,9 +13441,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } -Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 08c51b66dfe7..875c06210ae6 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -483,10 +483,10 @@ class InstrItineraryData; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; - Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; - Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; + Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const 
override; + Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index a94d6048f02d..d06b7d0896f1 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -16,7 +16,8 @@ // // Type profiles. -def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; def SDT_ARMStructByVal : SDTypeProfile<0, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, @@ -1968,8 +1969,8 @@ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKDOWN : -PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, - [(ARMcallseq_start timm:$amt)]>; +PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary, + [(ARMcallseq_start timm:$amt, timm:$amt2)]>; } def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary, diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 8048c758e998..bee83dfb6f63 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -284,8 +284,8 @@ def tADJCALLSTACKUP : Requires<[IsThumb, IsThumb1Only]>; def tADJCALLSTACKDOWN : - PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, - [(ARMcallseq_start imm:$amt)]>, + PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary, + [(ARMcallseq_start imm:$amt, imm:$amt2)]>, Requires<[IsThumb, IsThumb1Only]>; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 2ac3fda9f448..8c680cdf9b47 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -101,14 +101,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, assert(RegBank && "Can't get reg bank for virtual register"); const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - (void)DstSize; - unsigned SrcReg = I.getOperand(1).getReg(); - const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - (void)SrcSize; - // We use copies for trunc, so it's ok for the size of the destination to be - // smaller (the higher bits will just be undefined). 
- assert(DstSize <= SrcSize && "Copy with different width?!"); - assert((RegBank->getID() == ARM::GPRRegBankID || RegBank->getID() == ARM::FPRRegBankID) && "Unsupported reg bank"); @@ -135,28 +127,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return true; } -static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, - MachineRegisterInfo &MRI) { - assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp"); - - LLT Ty = MRI.getType(MIB->getOperand(0).getReg()); - unsigned ValSize = Ty.getSizeInBits(); - - if (ValSize == 32) { - if (TII.getSubtarget().useNEONForSinglePrecisionFP()) - return false; - MIB->setDesc(TII.get(ARM::VADDS)); - } else { - assert(ValSize == 64 && "Unsupported size for floating point value"); - if (TII.getSubtarget().isFPOnlySP()) - return false; - MIB->setDesc(TII.get(ARM::VADDD)); - } - MIB.add(predOps(ARMCC::AL)); - - return true; -} - static bool selectSequence(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, MachineRegisterInfo &MRI, @@ -352,6 +322,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } break; } + case G_ANYEXT: case G_TRUNC: { // The high bits are undefined, so there's nothing special to do, just // treat it as a copy. @@ -362,12 +333,12 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); if (SrcRegBank.getID() != DstRegBank.getID()) { - DEBUG(dbgs() << "G_TRUNC operands on different register banks\n"); + DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n"); return false; } if (SrcRegBank.getID() != ARM::GPRRegBankID) { - DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n"); + DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n"); return false; } @@ -393,10 +364,6 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; - case G_FADD: - if (!selectFAdd(MIB, TII, MRI)) - return false; - break; case G_FRAME_INDEX: // Add 0 to the given frame index and hope it will eventually be folded into // the user(s). 
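Selecting G_TRUNC and G_ANYEXT as plain copies is sound because, in both directions, the bits beyond the narrow width are simply undefined; any masking happens at the point of use, not at the copy. A standalone sketch of that invariant (plain C++, not the GlobalISel API):

#include <cstdint>
#include <cstdio>

// A 32-bit storage unit standing in for a GPR; narrower values live in the
// low bits and the high bits are unspecified.
struct Reg32 { uint32_t Bits; };

// Both G_TRUNC (wide -> narrow) and G_ANYEXT (narrow -> wide) can be modeled
// as bit-identical copies, since neither promises anything about high bits.
Reg32 copyBits(Reg32 Src) { return Src; }

// Consumers that need a defined 8-bit value mask at the point of use.
uint8_t useAsU8(Reg32 R) { return static_cast<uint8_t>(R.Bits & 0xffu); }

int main() {
  Reg32 Wide{0x12345678u};
  Reg32 Narrow = copyBits(Wide);      // "trunc": no instruction needed
  Reg32 WideAgain = copyBits(Narrow); // "anyext": high bits stay undefined
  std::printf("%#x\n", static_cast<unsigned>(useAsU8(WideAgain))); // 0x78
  return 0;
}
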
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 9b86030fdd29..5bf6c7aed6b8 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -45,9 +45,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, 1, p0}, Legal); } - for (unsigned Op : {G_ADD, G_SUB, G_MUL}) - for (auto Ty : {s1, s8, s16, s32}) - setAction({Op, Ty}, Legal); + for (unsigned Op : {G_ADD, G_SUB, G_MUL}) { + for (auto Ty : {s1, s8, s16}) + setAction({Op, Ty}, WidenScalar); + setAction({Op, s32}, Legal); + } for (unsigned Op : {G_SDIV, G_UDIV}) { for (auto Ty : {s8, s16}) diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp index 581d5fe159fd..7e4d598a6e0b 100644 --- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp +++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp @@ -88,13 +88,15 @@ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) { } } } + bool Changed = false; // Remove the tagged DMB for (auto MI : ToRemove) { MI->eraseFromParent(); ++NumDMBsRemoved; + Changed = true; } - return NumDMBsRemoved > 0; + return Changed; } /// createARMOptimizeBarriersPass - Returns an instance of the remove double diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 13a32211f88c..a20997c95cd9 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -225,6 +225,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_UDIV: case G_SEXT: case G_ZEXT: + case G_ANYEXT: case G_TRUNC: case G_GEP: // FIXME: We're abusing the fact that everything lives in a GPR for now; in diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index d09f3ecbaa28..5583d6148b08 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -13,7 +13,9 @@ #include "ARM.h" #include "ARMCallLowering.h" #include "ARMLegalizerInfo.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL #include "ARMRegisterBankInfo.h" +#endif #include "ARMSubtarget.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index c297865db820..0ec8e8b08ceb 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -375,7 +375,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( DebugLoc DL = MI->getDebugLoc(); unsigned int Opcode = MI->getOpcode(); - int Amount = MI->getOperand(0).getImm(); + int Amount = TII.getFrameSize(*MI); // Adjcallstackup does not need to allocate stack space for the call, instead // we insert push instructions that will allocate the necessary stack. diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index f0ab6acedad1..ef9c00e4b784 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -361,7 +361,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr); + Type *RetTy = (Type *)StructType::get(Ty, Ty); SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); @@ -1166,8 +1166,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. 
unsigned NumBytes = CCInfo.getNextStackOffset(); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SmallVector, 8> RegsToPass; @@ -1611,8 +1610,9 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator I = MBB->getParent()->begin(); - ++I; + MachineFunction::iterator I; + for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I); + if (I != MF->end()) ++I; MF->insert(I, trueMBB); MF->insert(I, falseMBB); diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index 1b6547ef7795..06ad2b3ffdf8 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -17,7 +17,7 @@ include "AVRInstrFormats.td" // AVR Type Profiles //===----------------------------------------------------------------------===// -def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>; +def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; @@ -333,9 +333,9 @@ let Defs = [SP, SREG], Uses = [SP] in { def ADJCALLSTACKDOWN : Pseudo<(outs), - (ins i16imm:$amt), + (ins i16imm:$amt, i16imm:$amt2), "#ADJCALLSTACKDOWN", - [(AVRcallseq_start timm:$amt)]>; + [(AVRcallseq_start timm:$amt, timm:$amt2)]>; // R31R30 is used to update SP, since it is a scratch reg and this instruction // is placed after the function call then R31R30 should be always free. diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 2813e24d2ac7..11a47bad78ba 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -52,7 +52,6 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF, BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const AVRTargetMachine &TM = static_cast(MF.getTarget()); - const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering(); // Reserve the intermediate result registers r1 and r2 // The result of instructions like 'mul' is always stored here. diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index b9b3dff95c0a..6897161c903c 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -257,8 +257,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } auto PtrVT = getPointerTy(MF.getDataLayout()); - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); SmallVector, MaxArgs> RegsToPass; diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index 93ee24371c4d..c6c0ff587c6b 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -16,7 +16,8 @@ include "BPFInstrFormats.td" // Instruction Operands and Patterns // These are target-independent nodes, but have target-specific formats. 
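The same mechanical change repeats for every target in this patch: `callseq_start` (and the ADJCALLSTACKDOWN pseudo that carries it) grows a second immediate to match `callseq_end`, and every call site updated here passes 0 for it, e.g. `DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL)`. A minimal sketch of the new node shape, with illustrative types rather than LLVM's:

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the two immediates now carried by
// CALLSEQ_START / ADJCALLSTACKDOWN. All targets touched in this patch pass
// 0 for the second operand, mirroring CALLSEQ_END's two-immediate form.
struct CallSeqStart {
  uint64_t Amt1; // bytes of outgoing-argument stack to set up
  uint64_t Amt2; // new operand; 0 at every call site updated here
};

int main() {
  CallSeqStart Down{16, 0};
  std::printf("#ADJCALLSTACKDOWN %llu %llu\n",
              static_cast<unsigned long long>(Down.Amt1),
              static_cast<unsigned long long>(Down.Amt2));
  return 0;
}
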
-def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>; +def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>, + SDTCisVT<1, iPTR>]>; def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>; @@ -445,9 +446,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1, // ADJCALLSTACKDOWN/UP pseudo insns let Defs = [R11], Uses = [R11] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt), - "#ADJCALLSTACKDOWN $amt", - [(BPFcallseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(BPFcallseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", [(BPFcallseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 861af94f1e38..1dffebe97f2d 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -848,8 +848,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Glue; if (!IsTailCall) { - SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true); - Chain = DAG.getCALLSEQ_START(Chain, C, dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); Glue = Chain.getValue(1); } diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 5a5799dbe009..e4df7ff5c200 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1209,7 +1209,7 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V, KnownBits Known(T->getBitWidth()); computeKnownBits(V, Known, DL); - return Known.Zero.countLeadingOnes() >= IterCount; + return Known.countMinLeadingZeros() >= IterCount; } diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 32503d111c24..81b5e10c1173 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -714,7 +714,8 @@ def: Pat<(i1 0), (PS_false)>; def: Pat<(i1 1), (PS_true)>; // Pseudo instructions. 
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -732,8 +733,8 @@ def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def: Pat<(callseq_start timm:$amt), - (ADJCALLSTACKDOWN imm:$amt)>; +def: Pat<(callseq_start timm:$amt, timm:$amt2), + (ADJCALLSTACKDOWN imm:$amt, imm:$amt2)>; def: Pat<(callseq_end timm:$amt1, timm:$amt2), (ADJCALLSTACKUP imm:$amt1, imm:$amt2)>; diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index 8c2caea2d5c5..0f99dfe342b8 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -80,7 +80,7 @@ def PS_false : InstHexagon<(outs PredRegs:$dst), (ins), "", [(set I1:$dst, 0)], "", C2_andn.Itinerary, TypeCR>; let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), ".error \"should not emit\" ", []>; let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index d156294a0b0c..0a9cac2565f2 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "LanaiISelLowering.h" #include "Lanai.h" #include "LanaiCondCode.h" -#include "LanaiISelLowering.h" #include "LanaiMachineFunctionInfo.h" #include "LanaiSubtarget.h" #include "LanaiTargetObjectFile.h" @@ -38,10 +38,11 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetCallingConv.h" @@ -649,10 +650,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo( ByValArgs.push_back(FIPtr); } - Chain = DAG.getCALLSEQ_START( - Chain, - DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SmallVector, 4> RegsToPass; SmallVector MemOpChains; @@ -1502,3 +1500,24 @@ SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } + +void LanaiTargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, KnownBits &Known, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = Known.getBitWidth(); + switch (Op.getOpcode()) { + default: + break; + case LanaiISD::SETCC: + Known = KnownBits(BitWidth); + Known.Zero.setBits(1, BitWidth); + break; + case LanaiISD::SELECT_CC: + KnownBits Known2; + DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1); + DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; + break; + } +} diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h index c2fba4f9d167..49ad52a39771 100644 --- a/lib/Target/Lanai/LanaiISelLowering.h +++ b/lib/Target/Lanai/LanaiISelLowering.h @@ -106,6 +106,11 @@ public: SDValue PerformDAGCombine(SDNode *N, 
DAGCombinerInfo &DCI) const override; + void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + private: SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool IsVarArg, diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td index 285fca11737d..776fee101dfe 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.td +++ b/lib/Target/Lanai/LanaiInstrInfo.td @@ -22,7 +22,8 @@ include "LanaiInstrFormats.td" // -------------------------------------------------- // // These are target-independent nodes, but have target-specific formats. -def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; @@ -750,9 +751,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1, // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber SP. let Defs = [SP], Uses = [SP] in { - def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - "#ADJCALLSTACKDOWN $amt", - [(CallSeqStart timm:$amt)]>; + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(CallSeqStart timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", [(CallSeqEnd timm:$amt1, timm:$amt2)]>; @@ -770,9 +771,6 @@ let Uses = [SR] in { [(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>; } -// SCC's output is already 1-bit so and'ing with 1 is redundant. -def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>; - // Select with hardware support let Uses = [SR], isSelect = 1 in { def SELECT : InstRR<0b111, (outs GPR:$Rd), diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index f1cb0b6c031b..b4ff8f66c55f 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -236,7 +236,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( // adjcallstackdown instruction into 'add SP, ' // TODO: consider using push / pop instead of sub + store / add MachineInstr &Old = *I; - uint64_t Amount = Old.getOperand(0).getImm(); + uint64_t Amount = TII.getFrameSize(Old); if (Amount != 0) { // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -252,8 +252,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( } else { assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode()); // factor out the amount the callee already popped. - uint64_t CalleeAmt = Old.getOperand(1).getImm(); - Amount -= CalleeAmt; + Amount -= TII.getFramePoppedByCallee(Old); if (Amount) New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri), MSP430::SP) @@ -272,7 +271,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. 
- if (uint64_t CalleeAmt = I->getOperand(1).getImm()) { + if (uint64_t CalleeAmt = TII.getFramePoppedByCallee(*I)) { MachineInstr &Old = *I; MachineInstr *New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP) diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 40b1dd3cc2eb..cc6e64043f54 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -40,21 +40,24 @@ using namespace llvm; typedef enum { NoHWMult, - HWMultIntr, - HWMultNoIntr + HWMult16, + HWMult32, + HWMultF5 } HWMultUseMode; static cl::opt -HWMultMode("msp430-hwmult-mode", cl::Hidden, +HWMultMode("mhwmult", cl::Hidden, cl::desc("Hardware multiplier use mode"), - cl::init(HWMultNoIntr), + cl::init(NoHWMult), cl::values( - clEnumValN(NoHWMult, "no", + clEnumValN(NoHWMult, "none", "Do not use hardware multiplier"), - clEnumValN(HWMultIntr, "interrupts", - "Assume hardware multiplier can be used inside interrupts"), - clEnumValN(HWMultNoIntr, "use", - "Assume hardware multiplier cannot be used inside interrupts"))); + clEnumValN(HWMult16, "16bit", + "Use 16-bit hardware multiplier"), + clEnumValN(HWMult32, "32bit", + "Use 32-bit hardware multiplier"), + clEnumValN(HWMultF5, "f5series", + "Use F5 series hardware multiplier"))); MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI) @@ -131,29 +134,29 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); // FIXME: Implement efficiently multiplication by a constant - setOperationAction(ISD::MUL, MVT::i8, Expand); - setOperationAction(ISD::MULHS, MVT::i8, Expand); - setOperationAction(ISD::MULHU, MVT::i8, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand); - setOperationAction(ISD::MUL, MVT::i16, Expand); + setOperationAction(ISD::MUL, MVT::i8, Promote); + setOperationAction(ISD::MULHS, MVT::i8, Promote); + setOperationAction(ISD::MULHU, MVT::i8, Promote); + setOperationAction(ISD::SMUL_LOHI, MVT::i8, Promote); + setOperationAction(ISD::UMUL_LOHI, MVT::i8, Promote); + setOperationAction(ISD::MUL, MVT::i16, LibCall); setOperationAction(ISD::MULHS, MVT::i16, Expand); setOperationAction(ISD::MULHU, MVT::i16, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); - setOperationAction(ISD::UDIV, MVT::i8, Expand); - setOperationAction(ISD::UDIVREM, MVT::i8, Expand); - setOperationAction(ISD::UREM, MVT::i8, Expand); - setOperationAction(ISD::SDIV, MVT::i8, Expand); - setOperationAction(ISD::SDIVREM, MVT::i8, Expand); - setOperationAction(ISD::SREM, MVT::i8, Expand); - setOperationAction(ISD::UDIV, MVT::i16, Expand); + setOperationAction(ISD::UDIV, MVT::i8, Promote); + setOperationAction(ISD::UDIVREM, MVT::i8, Promote); + setOperationAction(ISD::UREM, MVT::i8, Promote); + setOperationAction(ISD::SDIV, MVT::i8, Promote); + setOperationAction(ISD::SDIVREM, MVT::i8, Promote); + setOperationAction(ISD::SREM, MVT::i8, Promote); + setOperationAction(ISD::UDIV, MVT::i16, LibCall); setOperationAction(ISD::UDIVREM, MVT::i16, Expand); - setOperationAction(ISD::UREM, MVT::i16, Expand); - setOperationAction(ISD::SDIV, MVT::i16, Expand); + setOperationAction(ISD::UREM, MVT::i16, LibCall); + setOperationAction(ISD::SDIV, MVT::i16, LibCall); setOperationAction(ISD::SDIVREM, MVT::i16, Expand); - setOperationAction(ISD::SREM, MVT::i16, Expand); + 
setOperationAction(ISD::SREM, MVT::i16, LibCall); // varargs support setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -162,15 +165,183 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::JumpTable, MVT::i16, Custom); - // Libcalls names. - if (HWMultMode == HWMultIntr) { - setLibcallName(RTLIB::MUL_I8, "__mulqi3hw"); - setLibcallName(RTLIB::MUL_I16, "__mulhi3hw"); - } else if (HWMultMode == HWMultNoIntr) { - setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint"); - setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint"); + // EABI Libcalls - EABI Section 6.2 + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Floating point conversions - EABI Table 6 + { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc but is not in the EABI + { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID }, + // The following IS implemented in libgcc but is not in the EABI + { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc but is not in the EABI + { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID }, + // The following IS implemented in libgcc but is not in the EABI + { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID }, + + // Floating point comparisons - EABI Table 7 + { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ }, + { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE }, + { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE }, + { RTLIB::OLT_F64, 
"__mspabi_cmpd", ISD::SETLT }, + { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE }, + { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT }, + { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ }, + { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE }, + { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE }, + { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT }, + { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE }, + { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT }, + + // Floating point arithmetic - EABI Table 8 + { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID }, + { RTLIB::ADD_F32, "__mspabi_addf", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID }, + // The following are NOT implemented in libgcc + // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID }, + // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID }, + + // TODO: SLL/SRA/SRL are in libgcc, RLL isn't + + // Universal Integer Operations - EABI Table 9 + { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID }, + { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID }, + { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID }, + { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID }, + { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID }, + { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID }, + { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID }, + { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID }, + { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID }, + + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + + if (HWMultMode == HWMult16) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_hw" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_hw" }, + // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc + // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + } else if (HWMultMode == HWMult32) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_hw32" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_hw32" }, + // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc + // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + } else if (HWMultMode == HWMultF5) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_f5hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_f5hw" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_f5hw" }, + // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc + // TODO The 
__mspabi_mpyul*_f5hw functions ARE implemented in libgcc
+    };
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+    }
+  } else { // NoHWMult
+    const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+    } LibraryCalls[] = {
+      // Integer Multiply - EABI Table 9
+      { RTLIB::MUL_I16, "__mspabi_mpyi" },
+      { RTLIB::MUL_I32, "__mspabi_mpyl" },
+      { RTLIB::MUL_I64, "__mspabi_mpyll" },
+      // The __mspabi_mpysl* functions are NOT implemented in libgcc
+      // The __mspabi_mpyul* functions are NOT implemented in libgcc
+    };
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+    }
+    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN);
   }
 
+  // Several of the runtime library functions use a special calling conv
+  setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
+  // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
+
   setMinFunctionAlignment(1);
   setPrefFunctionAlignment(2);
 }
@@ -281,10 +452,27 @@ template <typename ArgT>
 static void AnalyzeArguments(CCState &State,
                              SmallVectorImpl<CCValAssign> &ArgLocs,
                              const SmallVectorImpl<ArgT> &Args) {
-  static const MCPhysReg RegList[] = {
+  static const MCPhysReg CRegList[] = {
     MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
   };
-  static const unsigned NbRegs = array_lengthof(RegList);
+  static const unsigned CNbRegs = array_lengthof(CRegList);
+  static const MCPhysReg BuiltinRegList[] = {
+    MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+  };
+  static const unsigned BuiltinNbRegs = array_lengthof(BuiltinRegList);
+
+  ArrayRef<MCPhysReg> RegList;
+  unsigned NbRegs;
+
+  bool Builtin = (State.getCallingConv() == CallingConv::MSP430_BUILTIN);
+  if (Builtin) {
+    RegList = BuiltinRegList;
+    NbRegs = BuiltinNbRegs;
+  } else {
+    RegList = CRegList;
+    NbRegs = CNbRegs;
+  }
 
   if (State.isVarArg()) {
     AnalyzeVarArgs(State, Args);
@@ -294,6 +482,11 @@ static void AnalyzeArguments(CCState &State,
   SmallVector<unsigned, 4> ArgsParts;
   ParseFunctionArgs(Args, ArgsParts);
 
+  if (Builtin) {
+    assert(ArgsParts.size() == 2 &&
+           "Builtin calling convention requires two arguments");
+  }
+
   unsigned RegsLeft = NbRegs;
   bool UsedStack = false;
   unsigned ValNo = 0;
@@ -323,6 +516,11 @@ static void AnalyzeArguments(CCState &State,
 
     unsigned Parts = ArgsParts[i];
 
+    if (Builtin) {
+      assert(Parts == 4 &&
+             "Builtin calling convention requires 64-bit arguments");
+    }
+
     if (!UsedStack && Parts == 2 && RegsLeft == 1) {
       // Special case for 32-bit register split, see EABI section 3.3.3
       unsigned Reg = State.AllocateReg(RegList);
@@ -400,6 +598,7 @@ MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   switch (CallConv) {
   default:
     llvm_unreachable("Unsupported calling convention");
+  case CallingConv::MSP430_BUILTIN:
   case CallingConv::Fast:
   case CallingConv::C:
     return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
@@ -598,7 +797,6 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
 /// LowerCCCCallTo - functions arguments are copied from virtual regs to
 /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-// TODO: sret.
 SDValue MSP430TargetLowering::LowerCCCCallTo(
     SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
     bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -615,8 +813,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(
   unsigned NumBytes = CCInfo.getNextStackOffset();
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
-  Chain = DAG.getCALLSEQ_START(Chain,
-                               DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
   SmallVector MemOpChains;
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index e3259bd6a7bc..d81f17e753c5 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -85,6 +85,12 @@ public:
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  int64_t getFramePoppedByCallee(const MachineInstr &I) const {
+    assert(isFrameInstr(I) && "Not a frame instruction");
+    assert(I.getOperand(1).getImm() >= 0 && "Size must not be negative");
+    return I.getOperand(1).getImm();
+  }
 };
 
 }
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 22fc2474fae6..1cd18611e52c 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -23,7 +23,8 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
 // Type Profiles.
 //===----------------------------------------------------------------------===//
 def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
-def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>,
+                                             SDTCisVT<1, i16>]>;
 def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
 def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
@@ -113,9 +114,9 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber SR.
 let Defs = [SP, SR], Uses = [SP] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
                               "#ADJCALLSTACKDOWN",
-                              [(MSP430callseq_start timm:$amt)]>;
+                              [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
 def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
                             "#ADJCALLSTACKUP",
                             [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
@@ -209,7 +210,7 @@ let isCall = 1 in
   // a use to prevent stack-pointer assignments that appear immediately
   // before calls from potentially appearing dead. Uses for argument
   // registers are added manually.
- let Defs = [R12, R13, R14, R15, SR], + let Defs = [R11, R12, R13, R14, R15, SR], Uses = [SP] in { def CALLi : II16i<0x0, (outs), (ins i16imm:$dst), diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 81cd9d1ad3f8..9600bc28f100 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -41,12 +41,12 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const Function* F = MF->getFunction(); static const MCPhysReg CalleeSavedRegs[] = { MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7, - MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11, + MSP430::R8, MSP430::R9, MSP430::R10, 0 }; static const MCPhysReg CalleeSavedRegsFP[] = { MSP430::R5, MSP430::R6, MSP430::R7, - MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11, + MSP430::R8, MSP430::R9, MSP430::R10, 0 }; static const MCPhysReg CalleeSavedRegsIntr[] = { diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 21c99da0922d..b83f44a74d5b 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1133,7 +1133,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, if (NumBytes < 16) NumBytes = 16; - emitInst(Mips::ADJCALLSTACKDOWN).addImm(16); + emitInst(Mips::ADJCALLSTACKDOWN).addImm(16).addImm(0); // Process the args. MVT firstMVT; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 8f39ebd42a5c..78bae6954c3c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -2787,7 +2787,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true); if (!IsTailCall) - Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL); + Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? 
Mips::SP_64 : Mips::SP, diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index b90077d7807d..8761946b8dbb 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -21,7 +21,7 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, SDTCisInt<4>]>; -def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>; def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, @@ -1719,8 +1719,8 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in { } let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { -def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt), - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index 68dcbdfb4211..f8d9c34556bc 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -257,7 +257,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg, // Get the instruction that loads the function address from the GOT. Reg = MO->getReg(); - Val = (Value*)nullptr; + Val = nullptr; MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = MRI.getVRegDef(Reg); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 61fdda8aa109..ebaaf42bc64e 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1430,8 +1430,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return Chain; SDValue tempChain = Chain; - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl); SDValue InFlag = Chain.getValue(1); unsigned paramCount = 0; @@ -1549,7 +1548,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getMemIntrinsicNode( Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), EltAlign); + TheStoreType, MachinePointerInfo(), EltAlign, + /* Volatile */ false, /* ReadMem */ false, + /* WriteMem */ true, /* Size */ 0); InFlag = Chain.getValue(1); // Cleanup. 
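
Every backend touched by this patch makes the same CALLSEQ_START conversion: the node and its ADJCALLSTACKDOWN pseudo now carry two immediate operands instead of one, and getCALLSEQ_START takes the byte counts as plain integers rather than a wrapped target constant. Every call site in this patch passes 0 for the new second immediate. A minimal sketch of the conversion, using only calls that appear in the surrounding hunks (Chain, NumBytes, and dl as in any LowerCall implementation):

    // Old form: the byte count had to be wrapped in a target constant.
    // Chain = DAG.getCALLSEQ_START(
    //     Chain, DAG.getIntPtrConstant(NumBytes, dl, /*isTarget=*/true), dl);

    // New form: pass the count directly, plus the new second immediate,
    // which is 0 at every call site in this patch.
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);

A target that pattern-matches callseq_start in its .td files has to widen the pseudo at the same time, as the MipsInstrInfo.td hunk above and the NVPTXInstrInfo.td hunk below both do; otherwise the two-operand SDNode no longer matches the one-operand pattern.
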
@@ -1609,7 +1610,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, theVal, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, - MachinePointerInfo()); + MachinePointerInfo(), /* Align */ 0, + /* Volatile */ false, /* ReadMem */ false, + /* WriteMem */ true, /* Size */ 0); InFlag = Chain.getValue(1); } @@ -1795,7 +1798,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; SDValue RetVal = DAG.getMemIntrinsicNode( Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign); + MachinePointerInfo(), EltAlign, /* Volatile */ false, + /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0); for (unsigned j = 0; j < NumElts; ++j) { SDValue Ret = RetVal.getValue(j); @@ -2579,7 +2583,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, - MachinePointerInfo(), 1); + MachinePointerInfo(), /* Align */ 1, + /* Volatile */ false, /* ReadMem */ false, + /* WriteMem */ true, /* Size */ 0); // Cleanup vector state. StoreOperands.clear(); } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 9378b29a9d0e..b5b5ea1ed639 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3101,7 +3101,8 @@ def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), (CBranchOther Int1Regs:$a, bb:$target)>; // Call -def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, @@ -3126,10 +3127,10 @@ class Pseudo pattern> : NVPTXInst; def Callseq_Start : - NVPTXInst<(outs), (ins i32imm:$amt), - "\\{ // callseq $amt\n" + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\{ // callseq $amt1, $amt2\n" "\t.reg .b32 temp_param_reg;", - [(callseq_start timm:$amt)]>; + [(callseq_start timm:$amt1, timm:$amt2)]>; def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), "\\} // callseq $amt1", diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 12ffbfdeacc1..11d22377611b 100644 --- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -204,6 +204,17 @@ static const unsigned G8Regs[] = { PPC::X28, PPC::X29, PPC::X30, PPC::X31 }; +static const unsigned G80Regs[] = { + PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; + static const unsigned QFRegs[] = { PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, @@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, G8Regs); } +static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return 
decodeRegisterClass(Inst, RegNo, G80Regs); +} + #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 609d959c6d08..84bb9ec56800 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (MI->getOpcode() == PPC::RLDICR) { + if (MI->getOpcode() == PPC::RLDICR || + MI->getOpcode() == PPC::RLDICR_32) { unsigned char SH = MI->getOperand(2).getImm(); unsigned char ME = MI->getOperand(3).getImm(); // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 9b91b9ab8f82..2fc8654deeab 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl &Args, // Issue CALLSEQ_START. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TII.getCallFrameSetupOpcode())) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Prepare to assign register arguments. Every argument uses up a // GPR protocol register even if it's passed in a floating-point @@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, } case PPC::EXTSW: + case PPC::EXTSW_32: case PPC::EXTSW_32_64: { if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8) return false; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 1b0402bf003d..5fa7b2c6bfb1 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -54,6 +54,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/Statistic.h" #include #include #include @@ -68,6 +69,14 @@ using namespace llvm; #define DEBUG_TYPE "ppc-codegen" +STATISTIC(NumSextSetcc, + "Number of (sext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(NumZextSetcc, + "Number of (zext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(SignExtensionsAdded, + "Number of sign extensions for compare inputs added."); +STATISTIC(ZeroExtensionsAdded, + "Number of zero extensions for compare inputs added."); // FIXME: Remove this once the bug has been fixed! cl::opt ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -252,7 +261,28 @@ namespace { #include "PPCGenDAGISel.inc" private: + // Conversion type for interpreting results of a 32-bit instruction as + // a 64-bit value or vice versa. + enum ExtOrTruncConversion { Ext, Trunc }; + + // Modifiers to guide how an ISD::SETCC node's result is to be computed + // in a GPR. 
+    // ZExtOrig - use the original condition code, zero-extend value
+    // ZExtInvert - invert the condition code, zero-extend value
+    // SExtOrig - use the original condition code, sign-extend value
+    // SExtInvert - invert the condition code, sign-extend value
+    enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
+
     bool trySETCC(SDNode *N);
+    bool tryEXTEND(SDNode *N);
+    SDValue signExtendInputIfNeeded(SDValue Input);
+    SDValue zeroExtendInputIfNeeded(SDValue Input);
+    SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
+    SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                int64_t RHSValue, SDLoc dl);
+    SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                int64_t RHSValue, SDLoc dl);
+    SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
 
     void PeepholePPC64();
     void PeepholePPC64ZExt();
@@ -2471,6 +2501,225 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
   return true;
 }
 
+/// If this node is a sign/zero extension of an integer comparison,
+/// it can usually be computed in GPRs rather than using comparison
+/// instructions and ISEL. We only do this on 64-bit targets for now
+/// as the code is specialized for 64-bit (it uses 64-bit instructions
+/// and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
+  if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+    return false;
+  assert((N->getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOpcode() == ISD::SIGN_EXTEND) &&
+         "Expecting a zero/sign extend node!");
+
+  if (N->getOperand(0).getOpcode() != ISD::SETCC)
+    return false;
+
+  SDValue WideRes =
+    getSETCCInGPR(N->getOperand(0),
+                  N->getOpcode() == ISD::SIGN_EXTEND ?
+                  SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+
+  if (!WideRes)
+    return false;
+
+  SDLoc dl(N);
+  bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32;
+  bool Output32Bit = N->getValueType(0) == MVT::i32;
+
+  NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
+  NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
+
+  SDValue ConvOp = WideRes;
+  if (Inputs32Bit != Output32Bit)
+    ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext :
+                           ExtOrTruncConversion::Trunc);
+  ReplaceNode(N, ConvOp.getNode());
+
+  return true;
+}
+
+/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) {
+  assert(Input.getValueType() == MVT::i32 &&
+         "Can only sign-extend 32-bit values here.");
+  unsigned Opc = Input.getOpcode();
+
+  // The value was sign extended and then truncated to 32-bits. No need to
+  // sign extend it again.
+  if (Opc == ISD::TRUNCATE &&
+      (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
+       Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
+    return Input;
+
+  LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+  // The input is a sign-extending load. No reason to sign-extend.
+  if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
+    return Input;
+
+  ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+  // We don't sign-extend constants and already sign-extended values.
+  if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG ||
+      Opc == ISD::SIGN_EXTEND)
+    return Input;
+
+  SDLoc dl(Input);
+  SignExtensionsAdded++;
+  return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0);
+}
+
+/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) {
+  assert(Input.getValueType() == MVT::i32 &&
+         "Can only zero-extend 32-bit values here.");
+  LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+  unsigned Opc = Input.getOpcode();
+
+  // No need to zero-extend loaded values (unless they're loaded with
+  // a sign-extending load).
+  if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
+    return Input;
+
+  ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+  bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0;
+  // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have
+  // to conservatively actually clear the high bits. We also don't need to
+  // zero-extend constants or values that are already zero-extended.
+  if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND)
+    return Input;
+
+  SDLoc dl(Input);
+  ZeroExtensionsAdded++;
+  return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input,
+                                        getI64Imm(0, dl), getI64Imm(32, dl)),
+                 0);
+}
+
+// Handle a 32-bit value in a 64-bit register and vice-versa. These are of
+// course not actual zero/sign extensions that will generate machine code,
+// they're just a way to reinterpret a 32 bit value in a register as a
+// 64 bit value and vice-versa.
+SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes,
+                                       ExtOrTruncConversion Conv) {
+  SDLoc dl(NatWidthRes);
+
+  // For reinterpreting 32-bit values as 64 bit values, we generate
+  // INSERT_SUBREG IMPLICIT_DEF:i64, <Input>, TargetConstant:i32<1>
+  if (Conv == ExtOrTruncConversion::Ext) {
+    SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
+    SDValue SubRegIdx =
+      CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+    return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
+                                          ImDef, NatWidthRes, SubRegIdx), 0);
+  }
+
+  assert(Conv == ExtOrTruncConversion::Trunc &&
+         "Unknown conversion between 32 and 64 bit values.");
+  // For reinterpreting 64-bit values as 32-bit values, we just need to
+  // EXTRACT_SUBREG (i.e. extract the low word).
+  SDValue SubRegIdx =
+    CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+  return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
+                                        NatWidthRes, SubRegIdx), 0);
+}
+
+/// Produces a zero-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS,
+                                             ISD::CondCode CC,
+                                             int64_t RHSValue, SDLoc dl) {
+  bool IsRHSZero = RHSValue == 0;
+  switch (CC) {
+  default: return SDValue();
+  case ISD::SETEQ: {
+    // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
+    // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
+    SDValue Xor = IsRHSZero ? LHS :
+      SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+    SDValue Clz =
+      SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+    SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
+                           getI32Imm(31, dl) };
+    return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                          ShiftOps), 0);
+  }
+  }
+}
+
+/// Produces a sign-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
+                                             ISD::CondCode CC,
+                                             int64_t RHSValue, SDLoc dl) {
+  bool IsRHSZero = RHSValue == 0;
+  switch (CC) {
+  default: return SDValue();
+  case ISD::SETEQ: {
+    // (sext (setcc %a, %b, seteq)) ->
+    //   (ashr (shl (ctlz (xor %a, %b)), 58), 63)
+    // (sext (setcc %a, 0, seteq)) ->
+    //   (ashr (shl (ctlz %a), 58), 63)
+    SDValue CountInput = IsRHSZero ? LHS :
+      SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+    SDValue Cntlzw =
+      SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
+    SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) };
+    SDValue Sldi =
+      SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0);
+    return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi,
+                                          getI32Imm(63, dl)), 0);
+  }
+  }
+}
+
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or false
+/// value is a power of two while the other is zero.
+SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
+                                       SetccInGPROpts ConvOpts) {
+  assert((Compare.getOpcode() == ISD::SETCC ||
+          Compare.getOpcode() == ISD::SELECT_CC) &&
+         "An ISD::SETCC node required here.");
+
+  SDValue LHS = Compare.getOperand(0);
+  SDValue RHS = Compare.getOperand(1);
+
+  // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+  int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+  ISD::CondCode CC =
+    cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+  EVT InputVT = LHS.getValueType();
+  if (InputVT != MVT::i32)
+    return SDValue();
+
+  SDLoc dl(Compare);
+  ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+  int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
+
+  if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+      ConvOpts == SetccInGPROpts::SExtInvert)
+    CC = ISD::getSetCCInverse(CC, true);
+
+  if (ISD::isSignedIntSetCC(CC)) {
+    LHS = signExtendInputIfNeeded(LHS);
+    RHS = signExtendInputIfNeeded(RHS);
+  } else if (ISD::isUnsignedIntSetCC(CC)) {
+    LHS = zeroExtendInputIfNeeded(LHS);
+    RHS = zeroExtendInputIfNeeded(RHS);
+  }
+
+  bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
+                ConvOpts == SetccInGPROpts::SExtInvert;
+  if (IsSext)
+    return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+  return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+}
+
 void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
   // Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); @@ -2508,6 +2757,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + if (tryEXTEND(N)) + return; + break; + case ISD::SETCC: if (trySETCC(N)) return; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 685f24cb502e..17bdd595da10 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -923,6 +923,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) @@ -4949,8 +4952,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -5000,9 +5002,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; @@ -5083,9 +5084,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; @@ -5268,8 +5269,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -5828,8 +5828,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -8741,9 +8740,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { // The mappings for emitLeading/TrailingFence is taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -8751,10 +8750,10 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, return nullptr; } -Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { - if (IsLoad && isAcquireOrStronger(Ord)) +Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) return callIntrinsic(Builder, Intrinsic::ppc_lwsync); // FIXME: this is too conservative, a dependent branch + isync is enough. // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and @@ -11316,6 +11315,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: break; + case ISD::SHL: + return combineSHL(N, DCI); + case ISD::SRA: + return combineSRA(N, DCI); + case ISD::SRL: + return combineSRL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. 
return N->getOperand(0); @@ -12948,3 +12953,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return Imm.isPosZero(); } } + +// For vector shift operation op, fold +// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) +static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + unsigned Opcode = N->getOpcode(); + unsigned TargetOpcode; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected shift operation"); + case ISD::SHL: + TargetOpcode = PPCISD::SHL; + break; + case ISD::SRL: + TargetOpcode = PPCISD::SRL; + break; + case ISD::SRA: + TargetOpcode = PPCISD::SRA; + break; + } + + if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && + N1->getOpcode() == ISD::AND) + if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) + if (Mask->getZExtValue() == OpSizeInBits - 1) + return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 32661099b79d..4fc744257262 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -117,9 +117,13 @@ namespace llvm { /// at function entry, used for PIC code. GlobalBaseReg, - /// These nodes represent the 32-bit PPC shifts that operate on 6-bit - /// shift amounts. These nodes are generated by the multi-precision shift - /// code. + /// These nodes represent PPC shifts. + /// + /// For scalar types, only the last `n + 1` bits of the shift amounts + /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. + /// for exact behaviors. + /// + /// For vector types, only the last n bits are used. See vsld. 
SRL, SRA, SHL, /// The combination of sra[wd]i and addze used to implemented signed @@ -617,10 +621,10 @@ namespace llvm { return true; } - Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; - Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; + Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -999,6 +1003,9 @@ namespace llvm { SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const; SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it @@ -1017,14 +1024,6 @@ namespace llvm { SDValue combineElementTruncationToVectorTruncation(SDNode *N, DAGCombinerInfo &DCI) const; - - bool supportsModuloShift(ISD::NodeType Inst, - EVT ReturnType) const override { - assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) && - "Expect a shift instruction"); - assert(isOperationLegal(Inst, ReturnType)); - return ReturnType.isVector(); - } }; namespace PPC { diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 997b96ca6ec8..a8433919f0f3 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS), "extsw", "$rA, $rS", IIC_IntSimple, [(set i64:$rA, (sext i32:$rS))]>, isPPC64; +let isCodeGenOnly = 1 in +def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS), + "extsw $rA, $rS", IIC_IntSimple, + []>, isPPC64; defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; +// For fast-isel: +let isCodeGenOnly = 1 in +def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH), + "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64; + defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS), "cntlzd", "$rA, $rS", IIC_IntGeneral, [(set i64:$rA, (ctlz i64:$rS))]>; @@ -721,15 +730,26 @@ defm RLDICL : MDForm_1r<30, 0, // For fast-isel: let isCodeGenOnly = 1 in def RLDICL_32_64 : MDForm_1<30, 0, - (outs g8rc:$rA), - (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), - "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, - []>, isPPC64; + (outs g8rc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; // End fast-isel. 
+let isCodeGenOnly = 1 in +def RLDICL_32 : MDForm_1<30, 0, + (outs gprc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDICR : MDForm_1r<30, 1, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; +let isCodeGenOnly = 1 in +def RLDICR_32 : MDForm_1<30, 1, + (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDIC : MDForm_1r<30, 2, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index c380766e9f5c..e14d18fd5433 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -987,6 +987,12 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)), (v8i16 (VSLH $vA, $vB))>; def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)), (v4i32 (VSLW $vA, $vB))>; +def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSLB $vA, $vB))>; +def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSLH $vA, $vB))>; +def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSLW $vA, $vB))>; def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)), (v16i8 (VSRB $vA, $vB))>; @@ -994,6 +1000,12 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)), (v8i16 (VSRH $vA, $vB))>; def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)), (v4i32 (VSRW $vA, $vB))>; +def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRB $vA, $vB))>; +def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRH $vA, $vB))>; +def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRW $vA, $vB))>; def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)), (v16i8 (VSRAB $vA, $vB))>; @@ -1001,6 +1013,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)), (v8i16 (VSRAH $vA, $vB))>; def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)), (v4i32 (VSRAW $vA, $vB))>; +def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRAB $vA, $vB))>; +def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRAH $vA, $vB))>; +def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRAW $vA, $vB))>; // Float to integer and integer to float conversions def : Pat<(v4i32 (fp_to_sint v4f32:$vA)), @@ -1072,14 +1090,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB), // Vector shifts def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>; def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsld $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>; + "vsld $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>; + "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrad $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>; + "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>; + +def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; +def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; 
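
The pattern changes above pair with the combineSHL/combineSRA/combineSRL hooks added to PPCISelLowering.cpp earlier in this patch. The generic shl/srl/sra patterns keep matching ordinary DAGs, while the PPCshl/PPCsrl/PPCsra forms are only produced by stripModuloOnShift, which drops a redundant mask on the shift amount. In the arrow notation the patch's own comments use, the fold is (v4i32 shown; 31 is the splat constant stripModuloOnShift checks for):

    (shl v4i32:$x, (and v4i32:$y, <31,31,31,31>)) -> (PPCshl v4i32:$x, v4i32:$y)

The and is safe to drop because the underlying instructions (vslw and friends) already use only the low-order bits of each shift-amount element, which is exactly what the reworded SRL/SRA/SHL comment in PPCISelLowering.h above documents.
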
// Vector Integer Arithmetic Instructions let isCommutable = 1 in { diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index f004ce49cac0..1af5e7f28342 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -33,7 +33,8 @@ def SDT_PPCVexts : SDTypeProfile<1, 2, [ SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2> ]>; -def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; def SDT_PPCvperm : SDTypeProfile<1, 3, [ @@ -1099,9 +1100,11 @@ multiclass AForm_3r opcode, bits<5> xo, dag OOL, dag IOL, let hasCtrlDep = 1 in { let Defs = [R1], Uses = [R1] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKUP $amt1 $amt2", [(callseq_end timm:$amt1, timm:$amt2)]>; } @@ -4163,6 +4166,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0 def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"clrldi $rA, $rS, $n", + (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 967557452f24..b98140fedfc0 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1436,7 +1436,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in { def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA), "mtvsrws $XT, $rA", IIC_VecGeneral, []>; - def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB), + def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB), "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral, []>, Requires<[In64BitMode]>; diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 0c1260a2965b..c7aa4cb78b7a 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -99,7 +99,8 @@ protected: // Don't really need to save data to the stack - the clobbered // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) + .addImm(0); // Expand into two ops built prior to the existing instruction. 
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index acb34d5baaa8..9e7e3c6b705a 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -773,8 +773,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } } - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; @@ -1165,8 +1164,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer to make room for the arguments. // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls // with more than 6 arguments. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL); // Collect the set of registers to pass to the function and their values. // This will be emitted as a sequence of CopyToReg nodes glued to the call @@ -2058,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue InFlag; - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL); + Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL); Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag); InFlag = Chain.getValue(1); SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT); @@ -3386,7 +3384,10 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; - case 'r': return C_RegisterClass; + case 'r': + case 'f': + case 'e': + return C_RegisterClass; case 'I': // SIMM13 return C_Other; } @@ -3465,6 +3466,24 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &SP::IntPairRegClass); else return std::make_pair(0U, &SP::IntRegsRegClass); + case 'f': + if (VT == MVT::f32) + return std::make_pair(0U, &SP::FPRegsRegClass); + else if (VT == MVT::f64) + return std::make_pair(0U, &SP::LowDFPRegsRegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &SP::LowQFPRegsRegClass); + llvm_unreachable("Unknown ValueType for f-register-type!"); + break; + case 'e': + if (VT == MVT::f32) + return std::make_pair(0U, &SP::FPRegsRegClass); + else if (VT == MVT::f64) + return std::make_pair(0U, &SP::DFPRegsRegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &SP::QFPRegsRegClass); + llvm_unreachable("Unknown ValueType for e-register-type!"); + break; } } else if (!Constraint.empty() && Constraint.size() <= 5 && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 5a19c624abb5..ae45c8be6752 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -195,7 +195,8 @@ def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP", [SDNPHasChain, SDNPSideEffect]>; // These are target-independent nodes, but have target-specific formats. 
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -404,9 +405,9 @@ let Defs = [O7] in { } let Defs = [O6], Uses = [O6] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - "!ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "!ADJCALLSTACKDOWN $amt1, $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), "!ADJCALLSTACKUP $amt1", [(callseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td index 6ecfddfc7d66..6625eaafd992 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.td +++ b/lib/Target/Sparc/SparcRegisterInfo.td @@ -346,11 +346,13 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>; // Floating point register classes. def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>; - def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>; - def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>; +// The Low?FPRegs classes are used only for inline-asm constraints. +def LowDFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>; +def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>; + // Floating point control register classes. def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 3f91ca9035a6..efcf6696fd50 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -262,6 +262,9 @@ public: bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const { return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287); } + bool isMemDisp12Len4(RegisterKind RegKind) const { + return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10); + } bool isMemDisp12Len8(RegisterKind RegKind) const { return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100); } @@ -347,6 +350,7 @@ public: bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); } bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); } bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); } + bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); } bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); } bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); } bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); } diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index a281a0aa6bcc..27fd70bc6092 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -327,6 +327,18 @@ static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field, return MCDisassembler::Success; } +static DecodeStatus decodeBDLAddr12Len4Operand(MCInst &Inst, uint64_t Field, + const unsigned *Regs) { + uint64_t Length = Field >> 16; + uint64_t Base = (Field >> 12) & 0xf; + uint64_t Disp = Field & 0xfff; + assert(Length < 16 && "Invalid BDLAddr12Len4"); + 
Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); + Inst.addOperand(MCOperand::createImm(Disp)); + Inst.addOperand(MCOperand::createImm(Length + 1)); + return MCDisassembler::Success; +} + static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field, const unsigned *Regs) { uint64_t Length = Field >> 16; @@ -399,6 +411,13 @@ static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field, return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs); } +static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst, + uint64_t Field, + uint64_t Address, + const void *Decoder) { + return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs); +} + static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst, uint64_t Field, uint64_t Address, diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index 092eb4011adc..d188f56512ab 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -77,6 +77,9 @@ private: uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint64_t getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -219,6 +222,17 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum, | ((Disp & 0xff000) >> 12); } +uint64_t SystemZMCCodeEmitter:: +getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI); + uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI); + uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1; + assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len)); + return (Len << 16) | (Base << 12) | Disp; +} + uint64_t SystemZMCCodeEmitter:: getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt index 86a1322c9e23..74cf653b9d95 100644 --- a/lib/Target/SystemZ/README.txt +++ b/lib/Target/SystemZ/README.txt @@ -63,7 +63,7 @@ via a register.) -- -We don't use ICM or STCM. +We don't use ICM, STCM, or CLM. 
-- diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index 716e5add8051..7bfa378aa85c 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -68,6 +68,11 @@ def FeaturePopulationCount : SystemZFeature< "Assume that the population-count facility is installed" >; +def FeatureMessageSecurityAssist4 : SystemZFeature< + "message-security-assist-extension4", "MessageSecurityAssist4", + "Assume that the message-security-assist extension facility 4 is installed" +>; + def Arch9NewFeatures : SystemZFeatureList<[ FeatureDistinctOps, FeatureFastSerialization, @@ -75,7 +80,8 @@ def Arch9NewFeatures : SystemZFeatureList<[ FeatureHighWord, FeatureInterlockedAccess1, FeatureLoadStoreOnCond, - FeaturePopulationCount + FeaturePopulationCount, + FeatureMessageSecurityAssist4 ]>; //===----------------------------------------------------------------------===// @@ -133,6 +139,11 @@ def FeatureLoadStoreOnCond2 : SystemZFeature< "Assume that the load/store-on-condition facility 2 is installed" >; +def FeatureMessageSecurityAssist5 : SystemZFeature< + "message-security-assist-extension5", "MessageSecurityAssist5", + "Assume that the message-security-assist extension facility 5 is installed" +>; + def FeatureVector : SystemZFeature< "vector", "Vector", "Assume that the vector facility is installed" >; @@ -142,6 +153,7 @@ def FeatureNoVector : SystemZMissingFeature<"Vector">; def Arch11NewFeatures : SystemZFeatureList<[ FeatureLoadAndZeroRightmostByte, FeatureLoadStoreOnCond2, + FeatureMessageSecurityAssist5, FeatureVector ]>; diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 6989aabb8c6a..235e095f0010 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1110,9 +1110,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Mark the start of the call. if (!IsTailCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getConstant(NumBytes, DL, PtrVT, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); // Copy argument values to their designated locations. SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass; @@ -6354,3 +6352,12 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( llvm_unreachable("Unexpected instr type to insert"); } } + +// This is only used by the isel schedulers, and is needed only to prevent +// the compiler from crashing when list-ilp is used.
+const TargetRegisterClass * +SystemZTargetLowering::getRepRegClassFor(MVT VT) const { + if (VT == MVT::Untyped) + return &SystemZ::ADDR128BitRegClass; + return TargetLowering::getRepRegClassFor(VT); +} diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 1c34dc43e8bb..79c8c4d92669 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -590,6 +590,8 @@ private: MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const; + + const TargetRegisterClass *getRepRegClassFor(MVT VT) const override; }; } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index bb6d27e24828..364b81f98eed 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -458,6 +458,12 @@ def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>; def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>; def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>; +// Divide to integer. +let Defs = [CC] in { + def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>; + def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>; +} + //===----------------------------------------------------------------------===// // Comparisons //===----------------------------------------------------------------------===// @@ -469,6 +475,13 @@ let Defs = [CC], CCValues = 0xF in { def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>; def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>; + + def KEBR : CompareRRE<"kebr", 0xB308, null_frag, FP32, FP32>; + def KDBR : CompareRRE<"kdbr", 0xB318, null_frag, FP64, FP64>; + def KXBR : CompareRRE<"kxbr", 0xB348, null_frag, FP128, FP128>; + + def KEB : CompareRXE<"keb", 0xED08, null_frag, FP32, load, 4>; + def KDB : CompareRXE<"kdb", 0xED18, null_frag, FP64, load, 8>; } // Test Data Class. 
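The DIEBR/DIDBR definitions above add the "divide to integer" operations, which produce a remainder together with an integral quotient and set CC. As a loose standard-C++ analogy only (std::remquo rounds the quotient to nearest and exposes just its low-order bits, so this is not the exact ISA semantics):

#include <cmath>
#include <cstdio>

int main() {
  double Dividend = 10.5, Divisor = 3.0;
  int Quo = 0;
  // One operation yields both the remainder and a (partial) quotient.
  double Rem = std::remquo(Dividend, Divisor, &Quo);
  std::printf("quotient bits=%d remainder=%g\n", Quo, Rem);
  return 0;
}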
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index c727f486087e..a37da2807854 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -710,6 +710,21 @@ class InstRSI op, dag outs, dag ins, string asmstr, list pattern> let Inst{15-0} = RI2; } +class InstRSLa op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = 0; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-8} = 0; + let Inst{7-0} = op{7-0}; +} + class InstRSYa op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -817,6 +832,37 @@ class InstSSa op, dag outs, dag ins, string asmstr, list pattern> let Inst{15-0} = BD2; } +class InstSSb op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + bits<20> BDL2; + + let Inst{47-40} = op; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = BDL2{19-16}; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-0} = BDL2{15-0}; +} + +class InstSSc op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + bits<16> BD2; + bits<4> I3; + + let Inst{47-40} = op; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = I3; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-0} = BD2; +} + class InstSSd op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -850,6 +896,20 @@ class InstSSe op, dag outs, dag ins, string asmstr, list pattern> let Inst{15-0} = BD4; } +class InstSSf op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<16> BD1; + bits<24> BDL2; + + let Inst{47-40} = op; + let Inst{39-32} = BDL2{23-16}; + let Inst{31-16} = BD1; + let Inst{15-0} = BDL2{15-0}; +} + class InstSSE op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1567,6 +1627,9 @@ class ICV // Inherent: // One register output operand and no input operands. // +// InherentDual: +// Two register output operands and no input operands. +// // StoreInherent: // One address operand. The instruction stores to the address. // @@ -1642,8 +1705,9 @@ class ICV // Two input operands and an implicit CC output operand. // // Test: -// Two input operands and an implicit CC output operand. The second -// input operand is an "address" operand used as a test class mask. +// One or two input operands and an implicit CC output operand. If +// present, the second input operand is an "address" operand used as +// a test class mask. // // Ternary: // One register output operand and three input operands. 
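The SSb format class defined above distributes each 20-bit BDL operand (len:4 | base:4 | disp:12) across the 48-bit word: the length nibbles land in bits 39-36 and 35-32, the base+displacement halves in bits 31-16 and 15-0. A standalone sketch of that assembly, using the PACK opcode 0xF2 from later in this patch (emitSSb is an illustrative helper, not patch code):

#include <cstdint>
#include <cstdio>

// Assemble a 48-bit SSb instruction word from an opcode and two 20-bit
// BDL operand fields, following the InstSSb bit assignments.
static uint64_t emitSSb(uint8_t Op, uint32_t BDL1, uint32_t BDL2) {
  uint64_t Inst = 0;
  Inst |= (uint64_t)Op << 40;                   // Inst{47-40} = op
  Inst |= (uint64_t)((BDL1 >> 16) & 0xf) << 36; // Inst{39-36} = BDL1{19-16}
  Inst |= (uint64_t)((BDL2 >> 16) & 0xf) << 32; // Inst{35-32} = BDL2{19-16}
  Inst |= (uint64_t)(BDL1 & 0xffff) << 16;      // Inst{31-16} = BDL1{15-0}
  Inst |= (uint64_t)(BDL2 & 0xffff);            // Inst{15-0}  = BDL2{15-0}
  return Inst; // only the low 48 bits are meaningful
}

int main() {
  std::printf("%012llx\n",
              (unsigned long long)emitSSb(0xF2, 0x3F123, 0x0E456));
  return 0;
}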
@@ -1691,6 +1755,10 @@ class InherentRRE opcode, RegisterOperand cls, let R2 = 0; } +class InherentDualRRE opcode, RegisterOperand cls> + : InstRRE; + class InherentVRIa opcode, bits<16> value> : InstVRIa { let I2 = value; @@ -1714,6 +1782,12 @@ class SideEffectInherentS opcode, let BD2 = 0; } +class SideEffectInherentRRE opcode> + : InstRRE { + let R1 = 0; + let R2 = 0; +} + // Allow an optional TLS marker symbol to generate TLS call relocations. class CallRI opcode> : InstRIb rsOpcode, } } +class LoadMultipleSSe opcode, RegisterOperand cls> + : InstSSe { + let mayLoad = 1; +} + class LoadMultipleVRSa opcode> : InstVRSa { @@ -2355,6 +2436,15 @@ class UnaryRRE opcode, SDPatternOperator operator, let OpType = "reg"; } +class UnaryMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; + let M3 = 0; +} + class UnaryRI opcode, SDPatternOperator operator, RegisterOperand cls, Immediate imm> : InstRIa opcode, : InstIE; +class SideEffectBinarySI opcode, Operand imm> + : InstSI; + class SideEffectBinarySIL opcode, SDPatternOperator operator, Immediate imm> : InstSIL; +class SideEffectBinarySSa opcode> + : InstSSa; + +class SideEffectBinarySSb opcode> + : InstSSb; + +class SideEffectBinarySSf opcode> + : InstSSf; + +class SideEffectBinaryMemMemRR opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRR { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +class SideEffectBinaryMemRRE opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE { + let Constraints = "$R2 = $R2src"; + let DisableEncoding = "$R2src"; +} + +class SideEffectBinaryMemMemRRE opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +class SideEffectBinaryMemMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; + let M3 = 0; +} + class BinaryRR opcode, SDPatternOperator operator, RegisterOperand cls1, RegisterOperand cls2> : InstRR opcode, SDPatternOperator operator, let M4 = 0; } +class BinaryMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2, Immediate imm> + : InstRRFc { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + +multiclass BinaryMemRRFcOpt opcode, + RegisterOperand cls1, RegisterOperand cls2> { + def "" : BinaryMemRRFc; + def Opt : UnaryMemRRFc; +} + class BinaryRRFe opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFe opcode, SDPatternOperator operator, let AccessBytes = bytes; } +class StoreBinaryRS opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr12only> + : InstRSb { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreBinaryRSY opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr20only> + : InstRSYb { + let mayStore = 1; + let AccessBytes = bytes; +} + +multiclass StoreBinaryRSPair rsOpcode, + bits<16> rsyOpcode, RegisterOperand cls, + bits<5> bytes> { + let DispKey = mnemonic ## #cls in { + let DispSize = "12" in + def "" : StoreBinaryRS; + let DispSize = "20" in + def Y : StoreBinaryRSY; + } +} + class StoreBinaryVRV opcode, bits<5> bytes, Immediate index> : InstVRV rxOpcode, bits<16> rxyOpcode, } } +class CompareRS opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr12only> + : InstRSb { + let mayLoad = 1; + let 
AccessBytes = bytes; +} + +class CompareRSY opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr20only> + : InstRSYb { + let mayLoad = 1; + let AccessBytes = bytes; +} + +multiclass CompareRSPair rsOpcode, bits<16> rsyOpcode, + RegisterOperand cls, bits<5> bytes> { + let DispKey = mnemonic ## #cls in { + let DispSize = "12" in + def "" : CompareRS; + let DispSize = "20" in + def Y : CompareRSY; + } +} + +class CompareSSb opcode> + : InstSSb { + let isCompare = 1; + let mayLoad = 1; +} + class CompareSI opcode, SDPatternOperator operator, SDPatternOperator load, Immediate imm, AddressingMode mode = bdaddr12only> @@ -3313,18 +3529,68 @@ class TestRXE opcode, SDPatternOperator operator, let M3 = 0; } +class TestRSL opcode> + : InstRSLa { + let mayLoad = 1; +} + +class SideEffectTernarySSc opcode> + : InstSSc; + +class SideEffectTernaryMemMemMemRRFb opcode, + RegisterOperand cls1, + RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb { + let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R2src, $R3src"; + let M4 = 0; +} + class SideEffectTernaryRRFc opcode, RegisterOperand cls1, RegisterOperand cls2, Immediate imm> : InstRRFc; +class SideEffectTernaryMemMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2, + Immediate imm> + : InstRRFc { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +multiclass SideEffectTernaryMemMemRRFcOpt opcode, + RegisterOperand cls1, + RegisterOperand cls2> { + def "" : SideEffectTernaryMemMemRRFc; + def Opt : SideEffectBinaryMemMemRRFc; +} + class SideEffectTernarySSF opcode, RegisterOperand cls> : InstSSF; +class TernaryRRFb opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + class TernaryRRFe opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFe rsOpcode, bits<16> rsyOpcode, } } +class SideEffectTernaryMemMemRS opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSa { + let Constraints = "$R1 = $R1src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R3src"; +} + +class SideEffectTernaryMemMemRSY opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSYa { + let Constraints = "$R1 = $R1src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R3src"; +} + class TernaryRXF opcode, SDPatternOperator operator, RegisterOperand cls, SDPatternOperator load, bits<5> bytes> : InstRXF // another instruction to handle the excess. multiclass MemorySS opcode, SDPatternOperator sequence, SDPatternOperator loop> { - def "" : InstSSa; + def "" : SideEffectBinarySSa; let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, imm64:$length), @@ -4003,13 +4285,8 @@ multiclass MemorySS opcode, // the full loop (the main instruction plus the branch on CC==3). 
multiclass StringRRE opcode, SDPatternOperator operator> { - def "" : InstRRE { - let Uses = [R0L]; - let Constraints = "$R1 = $R1src, $R2 = $R2src"; - let DisableEncoding = "$R1src, $R2src"; - } + let Uses = [R0L] in + def "" : SideEffectBinaryMemMemRRE; let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in def Loop : Pseudo<(outs GR64:$end), (ins GR64:$start1, GR64:$start2, GR32:$char), diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index d63525f29412..fa5ecdd85243 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// let hasNoSchedulingInfo = 1 in { - def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt), - [(callseq_start timm:$amt)]>; + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } @@ -464,6 +464,11 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>; // Memory-to-memory moves. let mayLoad = 1, mayStore = 1 in defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>; + def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>; + def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; +} // String moves. let mayLoad = 1, mayStore = 1, Defs = [CC] in @@ -707,6 +712,10 @@ def : StoreGR64PC; defm : StoreGR64Pair; def : StoreGR64PC; +// Store characters under mask -- not (yet) used for codegen. +defm STCM : StoreBinaryRSPair<"stcm", 0xBE, 0xEB2D, GR32, 0>; +def STCMH : StoreBinaryRSY<"stcmh", 0xEB2C, GRH32, 0>; + //===----------------------------------------------------------------------===// // Multi-register moves //===----------------------------------------------------------------------===// @@ -715,6 +724,7 @@ def : StoreGR64PC; defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>; def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>; def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>; +def LMD : LoadMultipleSSe<"lmd", 0xEF, GR64>; // Multi-register stores. defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>; @@ -742,6 +752,10 @@ def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>; def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>; def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>; +// Byte-swapping memory-to-memory moves. +let mayLoad = 1, mayStore = 1 in + def MVCIN : SideEffectBinarySSa<"mvcin", 0xE8>; + //===----------------------------------------------------------------------===// // Load address instructions //===----------------------------------------------------------------------===// @@ -816,6 +830,7 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>; defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>; defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>; +// Insert characters under mask -- not (yet) used for codegen. let Defs = [CC] in { defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>; def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>; @@ -919,6 +934,10 @@ let Defs = [CC] in { defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>; def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>; def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>; + + // Addition to memory. 
+ def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>; + def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>; } defm : ZXB; @@ -1166,9 +1185,14 @@ def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; // Multiplication of a register, producing two results. +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>; // Multiplication of memory, producing two results. +def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; +def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; //===----------------------------------------------------------------------===// @@ -1177,12 +1201,14 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; let hasSideEffects = 1 in { // Do not speculatively execute. // Division and remainder, from registers. + def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>; def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>; def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>; def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>; def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>; // Division and remainder, from memory. + def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>; def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>; def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>; def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>; @@ -1193,23 +1219,32 @@ let hasSideEffects = 1 in { // Do not speculatively execute. // Shifts //===----------------------------------------------------------------------===// -// Shift left. +// Logical shift left. let hasSideEffects = 0 in { defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; - defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>; def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; + def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>; +} + +// Arithmetic shift left. +let Defs = [CC] in { + defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>; + def SLAG : BinaryRSY<"slag", 0xEB0B, null_frag, GR64>; + def SLDA : BinaryRS<"slda", 0x8F, null_frag, GR128>; } // Logical shift right. let hasSideEffects = 0 in { defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; + def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>; } // Arithmetic shift right. let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>; + def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>; } // Rotate left. @@ -1351,8 +1386,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { defm : ZXB; // Memory-to-memory comparison. -let mayLoad = 1, Defs = [CC] in +let mayLoad = 1, Defs = [CC] in { defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>; + def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>; + def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>; + def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>; +} // String comparison. 
let mayLoad = 1, Defs = [CC] in @@ -1381,6 +1420,12 @@ let Defs = [CC] in { def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>; def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>; +// Compare logical characters under mask -- not (yet) used for codegen. +let Defs = [CC] in { + defm CLM : CompareRSPair<"clm", 0xBD, 0xEB21, GR32, 0>; + def CLMH : CompareRSY<"clmh", 0xEB20, GRH32, 0>; +} + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -1580,6 +1625,115 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { def LPDG : BinarySSF<"lpdg", 0xC85, GR128>; } +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +let mayLoad = 1, mayStore = 1 in + def TR : SideEffectBinarySSa<"tr", 0xDC>; + +let mayLoad = 1, Defs = [CC, R0L, R1D] in { + def TRT : SideEffectBinarySSa<"trt", 0xDD>; + def TRTR : SideEffectBinarySSa<"trtr", 0xD0>; +} + +let mayLoad = 1, mayStore = 1, Uses = [R0L] in + def TRE : SideEffectBinaryMemMemRRE<"tre", 0xB2A5, GR128, GR64>; + +let mayLoad = 1, Uses = [R1D], Defs = [CC] in { + defm TRTE : BinaryMemRRFcOpt<"trte", 0xB9BF, GR128, GR64>; + defm TRTRE : BinaryMemRRFcOpt<"trtre", 0xB9BD, GR128, GR64>; +} + +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { + defm TROO : SideEffectTernaryMemMemRRFcOpt<"troo", 0xB993, GR128, GR64>; + defm TROT : SideEffectTernaryMemMemRRFcOpt<"trot", 0xB992, GR128, GR64>; + defm TRTO : SideEffectTernaryMemMemRRFcOpt<"trto", 0xB991, GR128, GR64>; + defm TRTT : SideEffectTernaryMemMemRRFcOpt<"trtt", 0xB990, GR128, GR64>; +} + +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + defm CU12 : SideEffectTernaryMemMemRRFcOpt<"cu12", 0xB2A7, GR128, GR128>; + defm CU14 : SideEffectTernaryMemMemRRFcOpt<"cu14", 0xB9B0, GR128, GR128>; + defm CU21 : SideEffectTernaryMemMemRRFcOpt<"cu21", 0xB2A6, GR128, GR128>; + defm CU24 : SideEffectTernaryMemMemRRFcOpt<"cu24", 0xB9B1, GR128, GR128>; + def CU41 : SideEffectBinaryMemMemRRE<"cu41", 0xB9B2, GR128, GR128>; + def CU42 : SideEffectBinaryMemMemRRE<"cu42", 0xB9B3, GR128, GR128>; + + let isAsmParserOnly = 1 in { + defm CUUTF : SideEffectTernaryMemMemRRFcOpt<"cuutf", 0xB2A6, GR128, GR128>; + defm CUTFU : SideEffectTernaryMemMemRRFcOpt<"cutfu", 0xB2A7, GR128, GR128>; + } +} + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { + def KM : SideEffectBinaryMemMemRRE<"km", 0xB92E, GR128, GR128>; + def KMC : SideEffectBinaryMemMemRRE<"kmc", 0xB92F, GR128, GR128>; + + def KIMD : SideEffectBinaryMemRRE<"kimd", 0xB93E, GR64, GR128>; + def KLMD : SideEffectBinaryMemRRE<"klmd", 0xB93F, GR64, GR128>; + def KMAC : SideEffectBinaryMemRRE<"kmac", 0xB91E, GR64, GR128>; + + let Predicates = [FeatureMessageSecurityAssist4] in { + def KMF : SideEffectBinaryMemMemRRE<"kmf", 0xB92A, GR128, GR128>; + def KMO : SideEffectBinaryMemMemRRE<"kmo", 0xB92B, GR128, GR128>; + def KMCTR : SideEffectTernaryMemMemMemRRFb<"kmctr", 0xB92D, + GR128, GR128, GR128>; + def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; + } + let Predicates = [FeatureMessageSecurityAssist5] in + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 
0xB93C, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +defm CVB : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, load, 4>; +def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>; + +defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>; +def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>; + +let mayLoad = 1, mayStore = 1 in { + def MVN : SideEffectBinarySSa<"mvn", 0xD1>; + def MVZ : SideEffectBinarySSa<"mvz", 0xD3>; + def MVO : SideEffectBinarySSb<"mvo", 0xF1>; + + def PACK : SideEffectBinarySSb<"pack", 0xF2>; + def PKA : SideEffectBinarySSf<"pka", 0xE9>; + def PKU : SideEffectBinarySSf<"pku", 0xE1>; + def UNPK : SideEffectBinarySSb<"unpk", 0xF3>; + let Defs = [CC] in { + def UNPKA : SideEffectBinarySSa<"unpka", 0xEA>; + def UNPKU : SideEffectBinarySSa<"unpku", 0xE2>; + } +} + +let mayLoad = 1, mayStore = 1 in { + let Defs = [CC] in { + def AP : SideEffectBinarySSb<"ap", 0xFA>; + def SP : SideEffectBinarySSb<"sp", 0xFB>; + def ZAP : SideEffectBinarySSb<"zap", 0xF8>; + def SRP : SideEffectTernarySSc<"srp", 0xF0>; + } + def MP : SideEffectBinarySSb<"mp", 0xFC>; + def DP : SideEffectBinarySSb<"dp", 0xFD>; + let Defs = [CC] in { + def ED : SideEffectBinarySSa<"ed", 0xDE>; + def EDMK : SideEffectBinarySSa<"edmk", 0xDF>; + } +} + +let Defs = [CC] in { + def CP : CompareSSb<"cp", 0xF9>; + def TP : TestRSL<"tp", 0xEBC0>; +} + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -1712,12 +1866,39 @@ let usesCustomInserter = 1 in { // Search a block of memory for a character. let mayLoad = 1, Defs = [CC] in - defm SRST : StringRRE<"srst", 0xb25e, z_search_string>; + defm SRST : StringRRE<"srst", 0xB25E, z_search_string>; +let mayLoad = 1, Defs = [CC], Uses = [R0L] in + def SRSTU : SideEffectBinaryMemMemRRE<"srstu", 0xB9BE, GR64, GR64>; + +// Compare until substring equal. +let mayLoad = 1, Defs = [CC], Uses = [R0L, R1L] in + def CUSE : SideEffectBinaryMemMemRRE<"cuse", 0xB257, GR128, GR128>; + +// Compare and form codeword. +let mayLoad = 1, Defs = [CC, R1D, R2D, R3D], Uses = [R1D, R2D, R3D] in + def CFC : SideEffectAddressS<"cfc", 0xB21A, null_frag>; + +// Update tree. +let mayLoad = 1, mayStore = 1, Defs = [CC, R0D, R1D, R2D, R3D, R5D], + Uses = [R0D, R1D, R2D, R3D, R4D, R5D] in + def UPT : SideEffectInherentE<"upt", 0x0102>; + +// Checksum. +let mayLoad = 1, Defs = [CC] in + def CKSM : SideEffectBinaryMemMemRRE<"cksm", 0xB241, GR64, GR128>; + +// Compression call. +let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in + def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>; // Supervisor call. let hasSideEffects = 1, isCall = 1, Defs = [CC] in def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>; +// Monitor call. +let hasSideEffects = 1, isCall = 1 in + def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>; + // Store clock. let hasSideEffects = 1, Defs = [CC] in { def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>; @@ -1729,10 +1910,18 @@ let hasSideEffects = 1, Defs = [CC] in { let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>; +// Extract CPU attribute. +let hasSideEffects = 1 in + def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>; + // Extract CPU time. 
let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>; +// Extract PSW. +let hasSideEffects = 1, Uses = [CC] in + def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; + // Execute. let hasSideEffects = 1 in { def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td index 7bb4fe5afb3f..713612129d90 100644 --- a/lib/Target/SystemZ/SystemZOperands.td +++ b/lib/Target/SystemZ/SystemZOperands.td @@ -531,6 +531,7 @@ def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">; def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">; def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">; def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">; +def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">; def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">; def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">; def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">; @@ -578,6 +579,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">; def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">; def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">; def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">; +def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">; def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">; def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">; def bdvaddr12only : BDVMode< "64", "12">; diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index fde26ed4e1c5..adfc69c5d4cf 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -10,7 +10,8 @@ //===----------------------------------------------------------------------===// // Type profiles //===----------------------------------------------------------------------===// -def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>; +def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>, + SDTCisVT<1, i64>]>; def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>, SDTCisVT<1, i64>]>; def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index dbba8ab42b5a..1ce0168f95e9 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -56,12 +56,16 @@ def LSU_lat1 : SchedWrite; // Floating point unit (zEC12 and earlier) def FPU : SchedWrite; def FPU2 : SchedWrite; +def DFU : SchedWrite; +def DFU2 : SchedWrite; // Vector sub units (z13) def VecBF : SchedWrite; def VecBF2 : SchedWrite; def VecDF : SchedWrite; def VecDF2 : SchedWrite; +def VecDFX : SchedWrite; +def VecDFX2 : SchedWrite; def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit. 
def VecMul : SchedWrite; def VecStr : SchedWrite; diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index 7aee6f52e9a7..612c3b6cf96e 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -76,6 +76,8 @@ def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } def : WriteRes; // Move character def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -268,6 +271,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -277,6 +281,9 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[FXb, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of ceil(5/2) FXb ops) def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, GroupAlone], (instregex "STM(G|H|Y)?$")>; @@ -288,6 +295,7 @@ def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, def : InstRW<[FXa], (instregex "LRV(G)?R$")>; def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -345,7 +353,7 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; def : InstRW<[FXa], (instregex "ALGR(K)?$")>; def : InstRW<[FXa], (instregex "ALR(K)?$")>; def : InstRW<[FXa], (instregex "AR(K)?$")>; -def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; // Logical addition with carry def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>; @@ -438,11 +446,15 @@ def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXa, Lat5], (instregex "MGHI$")>; def : InstRW<[FXa, Lat5], (instregex "MHI$")>; def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>; +def : InstRW<[FXa, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>; def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>; def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>; @@ -456,7 +468,8 @@ def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>; def : InstRW<[FXa], (instregex "SLL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXa], (instregex "SLA(K)?$")>; +def : 
InstRW<[FXa], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXa, FXa, FXa, FXa, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -505,7 +518,7 @@ def : InstRW<[FXb, Lat2], (instregex "CGFR$")>; // Compare logical character def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -516,6 +529,9 @@ def : InstRW<[FXb], (instregex "TMHL(64)?$")>; def : InstRW<[FXb], (instregex "TMLH(64)?$")>; def : InstRW<[FXb], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXb, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -562,6 +578,42 @@ def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; // Load pair disjoint def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXb, VecDF, FXb, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXb, VecDFX, LSU, LSU, Lat9, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXb, VecDFX2, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXb, FXb, VecDFX2, LSU, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>; +def : InstRW<[VecDFX, LSU, Lat4, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -640,13 +692,30 @@ def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex 
"CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXb], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXb, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; @@ -811,14 +880,17 @@ def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>; def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>; def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[VecFPd, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>; -def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>; -def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>; +def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index a950e54e7601..670df8ff5541 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -59,6 +59,7 @@ def : WriteRes { let Latency = 30; let NumMicroOps = 0;} def Z196_FXUnit : ProcResource<2>; def Z196_LSUnit : ProcResource<2>; def Z196_FPUnit : ProcResource<1>; +def Z196_DFUnit : ProcResource<1>; // Subtarget specific definitions of scheduling resources. 
def : WriteRes { let Latency = 1; } @@ -66,6 +67,8 @@ def : WriteRes { let Latency = 4; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -152,6 +155,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>; // Move character def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -226,6 +230,7 @@ def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -235,6 +240,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], (instregex "STM(H|Y|G)?$")>; @@ -246,6 +254,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], def : InstRW<[FXU], (instregex "LRV(G)?R$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -285,7 +294,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>; // Addition //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>; def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>; def : InstRW<[FXU], (instregex "AIH$")>; def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; @@ -294,15 +303,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>; def : InstRW<[FXU], (instregex "AGR(K)?$")>; def : InstRW<[FXU], (instregex "AHI(K)?$")>; def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>; def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>; def : InstRW<[FXU], (instregex "ALGHSIK$")>; def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>; @@ -395,11 +403,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; +def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; 
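The instregex patterns in these schedule files are regular expressions matched against instruction names from the name's start, so "M(L)?R$" covers MR and MLR while "M(FY|L)?$" covers exactly M, MFY and ML. A quick standalone check, assuming std::regex full-match approximates TableGen's matching for these simple patterns:

#include <cstdio>
#include <regex>

int main() {
  // regex_match requires a full match, standing in for the anchored
  // matching that instregex performs (the '$' becomes implicit here).
  const std::regex MulMem("M(FY|L)?"), MulReg("M(L)?R");
  for (const char *Op : {"M", "MFY", "ML", "MR", "MLR", "MSGF"})
    std::printf("%-5s mem:%d reg:%d\n", Op,
                (int)std::regex_match(Op, MulMem),
                (int)std::regex_match(Op, MulReg));
  return 0;
}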
//===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "DR$")>; +def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "D$")>; def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone], @@ -416,7 +430,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>; +def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -465,7 +480,7 @@ def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>; // Compare logical character def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -476,6 +491,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>; def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch //===----------------------------------------------------------------------===// @@ -519,6 +537,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; // Load pair disjoint def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + 
(instregex "(M|D)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -571,13 +625,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXU], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -740,14 +811,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>; def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>; -def : InstRW<[FPU], (instregex "C(E|D)BR$")>; -def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>; +def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 8ab6c826f1ed..1bdb8779dc72 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -59,6 +59,7 @@ def : WriteRes { let Latency = 30; let NumMicroOps = 0;} def ZEC12_FXUnit : ProcResource<2>; def ZEC12_LSUnit : ProcResource<2>; def ZEC12_FPUnit : ProcResource<1>; +def ZEC12_DFUnit : ProcResource<1>; def ZEC12_VBUnit : ProcResource<1>; // Subtarget specific definitions of scheduling resources. 
@@ -67,6 +68,8 @@ def : WriteRes { let Latency = 4; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } def : WriteRes; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -155,6 +158,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>; // Move character def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -236,6 +240,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -245,6 +250,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], (instregex "STM(H|Y|G)?$")>; @@ -256,6 +264,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], def : InstRW<[FXU], (instregex "LRV(G)?R$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -295,7 +304,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>; // Addition //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>; def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>; def : InstRW<[FXU], (instregex "AIH$")>; def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; @@ -304,15 +313,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>; def : InstRW<[FXU], (instregex "AGR(K)?$")>; def : InstRW<[FXU], (instregex "AHI(K)?$")>; def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>; def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>; def : InstRW<[FXU], (instregex "ALGHSIK$")>; def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>; @@ -405,11 +413,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; +def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; 
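MR and MLR, added to the ZEC12 model here as well, write their 64-bit product to an even/odd 32-bit register pair, which is why they operate on GR128. A minimal sketch of that split, assuming the usual SystemZ convention that the even register receives the high word:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t A = 0x89ABCDEFu, B = 0x12345678u;
  uint64_t Prod = (uint64_t)A * B;        // what MLR computes (unsigned)
  uint32_t Even = (uint32_t)(Prod >> 32); // high word -> even register
  uint32_t Odd = (uint32_t)Prod;          // low word  -> odd register
  std::printf("pair = %08x:%08x\n", Even, Odd);
  return 0;
}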
//===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "DR$")>; +def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "D$")>; def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone], @@ -426,7 +440,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXU], (instregex "SLA(K)?$")>; +def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -475,7 +490,7 @@ def : InstRW<[FXU, Lat2], (instregex "CGFR$")>; // Compare logical character def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -486,6 +501,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>; def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -531,6 +549,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; // Load pair disjoint def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex 
"(M|D)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -609,13 +663,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXU], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -778,14 +849,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>; def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>; -def : InstRW<[FPU], (instregex "C(E|D)BR$")>; -def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>; +def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index ce07ea3318a5..022679a7bc18 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, const TargetMachine &TM) : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasPopulationCount(false), HasFastSerialization(false), - HasInterlockedAccess1(false), HasMiscellaneousExtensions(false), + HasPopulationCount(false), HasMessageSecurityAssist4(false), + HasFastSerialization(false), HasInterlockedAccess1(false), + HasMiscellaneousExtensions(false), HasExecutionHint(false), HasLoadAndTrap(false), HasTransactionalExecution(false), HasProcessorAssist(false), HasVector(false), HasLoadStoreOnCond2(false), - HasLoadAndZeroRightmostByte(false), + HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git 
a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index cdb61327a16a..770dd7cd939f 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -39,6 +39,7 @@ protected: bool HasHighWord; bool HasFPExtension; bool HasPopulationCount; + bool HasMessageSecurityAssist4; bool HasFastSerialization; bool HasInterlockedAccess1; bool HasMiscellaneousExtensions; @@ -49,6 +50,7 @@ protected: bool HasVector; bool HasLoadStoreOnCond2; bool HasLoadAndZeroRightmostByte; + bool HasMessageSecurityAssist5; private: Triple TargetTriple; @@ -104,6 +106,10 @@ public: // Return true if the target has the population-count facility. bool hasPopulationCount() const { return HasPopulationCount; } + // Return true if the target has the message-security-assist + // extension facility 4. + bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; } + // Return true if the target has the fast-serialization facility. bool hasFastSerialization() const { return HasFastSerialization; } @@ -132,6 +138,10 @@ public: return HasLoadAndZeroRightmostByte; } + // Return true if the target has the message-security-assist + // extension facility 5. + bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; } + // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 73d1d4be293b..6b45839c14b0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in { // Call sequence markers. These have an immediate which represents the amount of // stack space to allocate or free, which is used for varargs lowering. let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), - [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2), + [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>; def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2), [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>; } // isCodeGenOnly = 1 diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index a601b575f579..fa2146f7db84 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -25,7 +25,8 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific DAG Node Types. 
//===----------------------------------------------------------------------===//
 
-def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+                                                  SDTCisVT<1, iPTR>]>;
 def SDT_WebAssemblyCallSeqEnd :
     SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
 def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 784c3a6557ff..3a421fe77392 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
                                    "LEA instruction needs inputs at AG stage">;
 def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
                                    "LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+                                   "LEA instruction with 3 ops or certain registers is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                    "INC and DEC instructions are slower than ADD and SUB">;
 def FeatureSoftFloat
@@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [
   FeatureXSAVE,
   FeatureXSAVEOPT,
   FeatureLAHFSAHF,
+  FeatureSlow3OpsLEA,
   FeatureFastScalarFSQRT,
   FeatureFastSHLDRotate
 ]>;
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index ebd179e786da..fc3b4836c178 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -180,44 +180,6 @@ private:
 
 } // end anonymous namespace.
 
-static std::pair<X86::CondCode, bool>
-getX86ConditionCode(CmpInst::Predicate Predicate) {
-  X86::CondCode CC = X86::COND_INVALID;
-  bool NeedSwap = false;
-  switch (Predicate) {
-  default: break;
-  // Floating-point Predicates
-  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
-  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
-  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
-  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
-  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
-  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
-  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
-  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
-  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
-
-  // Integer Predicates
-  case CmpInst::ICMP_EQ:  CC = X86::COND_E; break;
-  case CmpInst::ICMP_NE:  CC = X86::COND_NE; break;
-  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
-  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
-  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
-  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
-  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
-  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
-  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
-  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
-  }
-
-  return std::make_pair(CC, NeedSwap);
-}
-
 static std::pair<unsigned, bool>
 getX86SSEConditionCode(CmpInst::Predicate Predicate) {
   unsigned CC;
@@ -1559,7 +1521,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
 
   X86::CondCode CC;
   bool SwapArgs;
-  std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+  std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
   assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
   unsigned Opc = X86::getSETFromCond(CC);
 
@@ -1697,7 +1659,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
       bool SwapArgs;
       unsigned BranchOpc;
-      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+      std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
       assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 
       BranchOpc = X86::GetCondBranchFromCond(CC);
@@ -2070,7 +2032,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
     }
 
     bool NeedSwap;
-    std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
     assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 
     const Value *CmpLHS = CI->getOperand(0);
@@ -2319,7 +2281,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
   const auto *CI = dyn_cast<CmpInst>(Cond);
   if (CI && (CI->getParent() == I->getParent())) {
     bool NeedSwap;
-    std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
     if (CC > X86::LAST_VALID_COND)
       return false;
 
@@ -3293,7 +3255,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-    .addImm(NumBytes).addImm(0);
+    .addImm(NumBytes).addImm(0).addImm(0);
 
   // Walk the register/memloc assignments, inserting copies/loads.
   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 2cd4c1a3e7b3..9f649dad8bc0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
 #include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
 
 STATISTIC(NumLEAs, "Number of LEA instructions created");
 
 namespace {
 class FixupLEAPass : public MachineFunctionPass {
   enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
-  static char ID;
+
   /// \brief Loop over all of the instructions in the basic block
   /// replacing applicable instructions with LEA instructions,
   /// where appropriate.
   bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
 
-  StringRef getPassName() const override { return "X86 LEA Fixup"; }
 
   /// \brief Given a machine register, look for the instruction
   /// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
   void processInstructionForSLM(MachineBasicBlock::iterator &I,
                                 MachineFunction::iterator MFI);
+
+  /// \brief Given a LEA instruction which is unprofitable
+  /// on SNB+ try to replace it with other instructions.
+  /// According to Intel's Optimization Reference Manual:
+  /// " For LEA instructions with three source operands and some specific
+  ///   situations, instruction latency has increased to 3 cycles, and must
+  ///   dispatch via port 1:
+  ///   - LEA that has all three source operands: base, index, and offset
+  ///   - LEA that uses base and index registers where the base is EBP, RBP,
+  ///     or R13
+  ///   - LEA that uses RIP relative addressing mode
+  ///   - LEA that uses 16-bit addressing mode "
+  /// This function currently handles the first 2 cases only.
+  MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+                                          MachineFunction::iterator MFI);
+
   /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
   /// and convert them to INC or DEC respectively.
   bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
                    MachineBasicBlock::iterator &MBBI) const;
 
 public:
-  FixupLEAPass() : MachineFunctionPass(ID) {}
+  static char ID;
+
+  StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+  FixupLEAPass() : MachineFunctionPass(ID) {
+    initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+  }
 
   /// \brief Loop over all of the basic blocks,
   /// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
   bool OptIncDec;
   bool OptLEA;
 };
-char FixupLEAPass::ID = 0;
 }
 
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
 MachineInstr *
 FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
                                  MachineBasicBlock::iterator &MBBI) const {
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
   OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
-  OptLEA = ST.LEAusesAG() || ST.slowLEA();
+  OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
 
   if (!OptLEA && !OptIncDec)
     return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
   return MachineBasicBlock::iterator();
 }
 
-static inline bool isLEA(const int opcode) {
-  return opcode == X86::LEA16r || opcode == X86::LEA32r ||
-         opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) {
+  return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+         Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) {
+  return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) {
+  return Op.isReg() && Op.getReg() != X86::NoRegister;
+}
+/// hasInefficientLEABaseReg - LEA that uses base and index registers
+/// where the base is EBP, RBP, or R13
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+                                            const MachineOperand &Index) {
+  return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+         isRegOperand(Index);
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+  return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+// LEA instruction that has all three operands: offset, base and index
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+                                      const MachineOperand &Index,
+                                      const MachineOperand &Offset) {
+  return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) {
+  switch (LEAOpcode) {
+  default:
+    llvm_unreachable("Unexpected LEA instruction");
+  case X86::LEA16r:
+    return X86::ADD16rr;
+  case X86::LEA32r:
+    return X86::ADD32rr;
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+    return X86::ADD64rr;
+  }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+  bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+  switch (LEAOpcode) {
+  default:
+    llvm_unreachable("Unexpected LEA instruction");
+  case X86::LEA16r:
+    return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+  case X86::LEA32r:
+  case X86::LEA64_32r:
+    return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+  case X86::LEA64r:
+    return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+  }
+}
 
 /// isLEASimpleIncOrDec - Does this LEA have one of these forms:
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
 void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
                                             MachineFunction::iterator MFI) {
   MachineInstr &MI = *I;
-  const int opcode = MI.getOpcode();
-  if (!isLEA(opcode))
+  const int Opcode = MI.getOpcode();
+  if (!isLEA(Opcode))
     return;
   if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
       !TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
     return;
   if (MI.getOperand(2).getImm() > 1)
     return;
-  int addrr_opcode, addri_opcode;
-  switch (opcode) {
-  default:
-    llvm_unreachable("Unexpected LEA instruction");
-  case X86::LEA16r:
-    addrr_opcode = X86::ADD16rr;
-    addri_opcode = X86::ADD16ri;
-    break;
-  case X86::LEA32r:
-    addrr_opcode = X86::ADD32rr;
-    addri_opcode = X86::ADD32ri;
-    break;
-  case X86::LEA64_32r:
-  case X86::LEA64r:
-    addrr_opcode = X86::ADD64rr;
-    addri_opcode = X86::ADD64ri32;
-    break;
-  }
   DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
   DEBUG(dbgs() << "FixLEA: Replaced by: ";);
   MachineInstr *NewMI = nullptr;
-  const MachineOperand &Dst = MI.getOperand(0);
   // Make ADD instruction for two registers writing to LEA's destination
   if (SrcR1 != 0 && SrcR2 != 0) {
-    const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
-    const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
-    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
-                .add(Dst)
-                .add(Src1)
-                .add(Src2);
-    MFI->insert(I, NewMI);
+    const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+    const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+    NewMI =
+        BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
     DEBUG(NewMI->dump(););
   }
   // Make ADD instruction for immediate
   if (MI.getOperand(4).getImm() != 0) {
+    const MCInstrDesc &ADDri =
+        TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
     const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
-    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
-                .add(Dst)
+    NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
                 .add(SrcR)
                 .addImm(MI.getOperand(4).getImm());
-    MFI->insert(I, NewMI);
     DEBUG(NewMI->dump(););
   }
   if (NewMI) {
     MFI->erase(I);
-    I = static_cast<MachineBasicBlock::iterator>(NewMI);
+    I = NewMI;
+  }
+}
+
+MachineInstr *
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
+                                        MachineFunction::iterator MFI) {
+
+  const int LEAOpcode = MI.getOpcode();
+  if (!isLEA(LEAOpcode))
+    return nullptr;
+
+  const MachineOperand &Dst = MI.getOperand(0);
+  const MachineOperand &Base = MI.getOperand(1);
+  const MachineOperand &Scale = MI.getOperand(2);
+  const MachineOperand &Index = MI.getOperand(3);
+  const MachineOperand &Offset = MI.getOperand(4);
+  const MachineOperand &Segment = MI.getOperand(5);
+
+  if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+        hasInefficientLEABaseReg(Base, Index)) ||
+      !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+      Segment.getReg() != X86::NoRegister)
+    return nullptr;
+
+  unsigned int DstR = Dst.getReg();
+  unsigned int BaseR = Base.getReg();
+  unsigned int IndexR = Index.getReg();
+  unsigned SSDstR =
+      (LEAOpcode == X86::LEA64_32r) ?
getX86SubSuperRegister(DstR, 64) : DstR; + bool IsScale1 = Scale.getImm() == 1; + bool IsInefficientBase = isInefficientLEAReg(BaseR); + bool IsInefficientIndex = isInefficientLEAReg(IndexR); + + // Skip these cases since it takes more than 2 instructions + // to replace the LEA instruction. + if (IsInefficientBase && SSDstR == BaseR && !IsScale1) + return nullptr; + if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && + (IsInefficientIndex || !IsScale1)) + return nullptr; + + const DebugLoc DL = MI.getDebugLoc(); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); + const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + + // First try to replace LEA with one or two (for the 3-op LEA case) + // add instructions: + // 1.lea (%base,%index,1), %base => add %index,%base + // 2.lea (%base,%index,1), %index => add %base,%index + if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { + const MachineOperand &Src = DstR == BaseR ? Index : Base; + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. + if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. 
+    if (hasLEAOffset(Offset)) {
+      NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+      DEBUG(NewMI->dump(););
+    }
+    return NewMI;
+  }
+  // Handle the rest of the cases with inefficient base register:
+  assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+  assert(IsInefficientBase && "efficient base should be handled already!");
+
+  // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+  if (IsScale1 && !hasLEAOffset(Offset)) {
+    TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+    DEBUG(MI.getPrevNode()->dump(););
+
+    MachineInstr *NewMI =
+        BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+    DEBUG(NewMI->dump(););
+    return NewMI;
  }
+  // lea offset(%base,%index,scale), %dst =>
+  // lea offset( ,%index,scale), %dst; add %base,%dst
+  MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+                            .add(Dst)
+                            .addReg(0)
+                            .add(Scale)
+                            .add(Index)
+                            .add(Offset)
+                            .add(Segment);
+  DEBUG(NewMI->dump(););
+
+  NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+  DEBUG(NewMI->dump(););
+  return NewMI;
+}
+
 bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
     if (OptLEA) {
       if (MF.getSubtarget<X86Subtarget>().isSLM())
         processInstructionForSLM(I, MFI);
-      else
-        processInstruction(I, MFI);
+
+      else {
+        if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+          if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+            MFI->erase(I);
+            I = NewMI;
+          }
+        } else
+          processInstruction(I, MFI);
+      }
     }
   }
   return false;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 12a10bf3072f..c899f0fd5100 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1178,8 +1178,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
       break;
 
-    if (ConstantSDNode
-          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
       unsigned Val = CN->getZExtValue();
       // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
       // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
       // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
       if (Val == 1 || Val == 2 || Val == 3) {
         AM.Scale = 1 << Val;
-        SDValue ShVal = N.getNode()->getOperand(0);
+        SDValue ShVal = N.getOperand(0);
 
         // Okay, we know that we have a scale by now. However, if the scaled
         // value is an add of something and a constant, we can fold the
         // constant into the disp field here.
         if (CurDAG->isBaseWithConstantOffset(ShVal)) {
-          AM.IndexReg = ShVal.getNode()->getOperand(0);
-          ConstantSDNode *AddVal =
-              cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+          AM.IndexReg = ShVal.getOperand(0);
+          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
           uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
           if (!foldOffsetIntoAddress(Disp, AM))
             return false;
@@ -1245,28 +1243,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     if (AM.BaseType == X86ISelAddressMode::RegBase &&
         AM.Base_Reg.getNode() == nullptr &&
         AM.IndexReg.getNode() == nullptr) {
-      if (ConstantSDNode
-            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
         if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
             CN->getZExtValue() == 9) {
           AM.Scale = unsigned(CN->getZExtValue())-1;
 
-          SDValue MulVal = N.getNode()->getOperand(0);
+          SDValue MulVal = N.getOperand(0);
           SDValue Reg;
 
           // Okay, we know that we have a scale by now. However, if the scaled
           // value is an add of something and a constant, we can fold the
           // constant into the disp field here.
           if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
-              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
-            Reg = MulVal.getNode()->getOperand(0);
+              isa<ConstantSDNode>(MulVal.getOperand(1))) {
+            Reg = MulVal.getOperand(0);
             ConstantSDNode *AddVal =
-              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+                cast<ConstantSDNode>(MulVal.getOperand(1));
             uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
             if (foldOffsetIntoAddress(Disp, AM))
-              Reg = N.getNode()->getOperand(0);
+              Reg = N.getOperand(0);
           } else {
-            Reg = N.getNode()->getOperand(0);
+            Reg = N.getOperand(0);
           }
 
           AM.IndexReg = AM.Base_Reg = Reg;
@@ -1289,7 +1286,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
     // Test if the LHS of the sub can be folded.
     X86ISelAddressMode Backup = AM;
-    if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
       AM = Backup;
       break;
     }
@@ -1300,7 +1297,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     }
 
     int Cost = 0;
-    SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+    SDValue RHS = Handle.getValue().getOperand(1);
     // If the RHS involves a register with multiple uses, this
     // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
@@ -1309,7 +1306,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
         RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
         RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
         (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
-         RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+         RHS.getOperand(0).getValueType() == MVT::i32))
       ++Cost;
     // If the base is a register with multiple uses, this
     // transformation may save a mov.
@@ -2524,7 +2521,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         N0.getNode()->hasOneUse() &&
         N0.getValueType() != MVT::i8 &&
         X86::isZeroNode(N1)) {
-      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
       if (!C) break;
 
       // For example, convert "testl %eax, $8" to "testb %al, $8"
@@ -2532,7 +2529,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
           (!(C->getZExtValue() & 0x80) ||
            hasNoSignedComparisonUses(Node))) {
         SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // On x86-32, only the ABCD registers have 8-bit subregisters.
         if (!Subtarget->is64Bit()) {
@@ -2568,7 +2565,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         // Shift the immediate right by 8 bits.
         SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
                                                        dl, MVT::i8);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // Put the value in an ABCD register.
         const TargetRegisterClass *TRC;
@@ -2605,7 +2602,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
            hasNoSignedComparisonUses(Node))) {
         SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
                                                 MVT::i16);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // Extract the 16-bit subregister.
         SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
@@ -2628,7 +2625,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
            hasNoSignedComparisonUses(Node))) {
         SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
                                                 MVT::i32);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // Extract the 32-bit subregister.
         SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ee2234595f9..11c08292518a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -79,6 +80,17 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
               " of the loop header PC will be 0)."),
     cl::Hidden);
 
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, + const char *Msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc())); +} + X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -1381,7 +1393,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); @@ -1445,8 +1457,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); @@ -1479,7 +1489,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); @@ -2207,15 +2217,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (ValVT == MVT::f64 && + (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } - // Likewise we can't return F64 values with SSE1 only. gcc does so, but - // llvm-gcc has never done it right and no one has noticed, so this - // should be OK for now. - if (ValVT == MVT::f64 && - (Subtarget.is64Bit() && !Subtarget.hasSSE2())) - report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. 
@@ -2528,7 +2540,8 @@ SDValue X86TargetLowering::LowerCallResult( // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -3415,8 +3428,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!IsSibcall) - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, + NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. @@ -6912,9 +6925,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) - return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), - DAG.getConstant(1, dl, VT), - DAG.getConstant(0, dl, VT)); + return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; @@ -8386,9 +8399,9 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; - return DAG.getNode(ISD::VSELECT, DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getSelect(DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, @@ -8748,8 +8761,9 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); + VT, + DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), + V1, V2)); } case MVT::v16f32: case MVT::v8f64: @@ -13817,6 +13831,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); + // If this VSELECT has a vector if i1 as a mask, it will be directly matched + // with patterns on the mask registers on AVX-512. + if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) + return Op; + // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) @@ -13826,10 +13845,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition + // into an i1 condition so that we can use the mask-based 512-bit blend + // instructions. + if (VT.getSizeInBits() == 512) { + SDValue Cond = Op.getOperand(0); + // The vNi1 condition case should be handled above as it can be trivially + // lowered. 
+    assert(Cond.getValueType().getScalarSizeInBits() ==
+               VT.getScalarSizeInBits() &&
+           "Should have a size-matched integer condition!");
+    // Build a mask by testing the condition against itself (tests for zero).
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+    // Now return a new VSELECT using the mask.
+    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op; but if we need to expand, return
   // a null value.
-  switch (Op.getSimpleValueType().SimpleTy) {
+  switch (VT.SimpleTy) {
   default:
     // Most of the vector types have blends past SSE4.1.
     return Op;
@@ -14725,7 +14764,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // location.
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
     SDValue Args[] = { Chain, Offset };
     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
@@ -15348,8 +15387,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   SDValue Zero = DAG.getIntPtrConstant(0, dl);
   SDValue Four = DAG.getIntPtrConstant(4, dl);
-  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
-                               Zero, Four);
+  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
 
   // Load the value out, extending it from f32 to f80.
@@ -15621,7 +15659,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   SDValue Zero =
       DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
 
-  SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
   if (VT == ExtVT)
     return SelectedVal;
   return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
@@ -16713,7 +16751,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
     if (BitWidth > AndBitWidth) {
       KnownBits Known;
       DAG.computeKnownBits(Op0, Known);
-      if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth)
+      if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
         return SDValue();
     }
     LHS = Op1;
@@ -17455,7 +17493,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
       VCmp = DAG.getBitcast(VCmpVT, VCmp);
 
-      SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+      SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
 
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
                          DAG.getIntPtrConstant(0, DL));
@@ -17483,9 +17521,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
       Op2Scalar = Op2.getOperand(0);
     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
-      SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
-                                      Op1Scalar.getValueType(),
-                                      Cond, Op1Scalar, Op2Scalar);
+      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
+                                        Op1Scalar, Op2Scalar);
       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
         return DAG.getBitcast(VT, newSelect);
       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
@@ -17500,8 +17537,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                         DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
       Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                         DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
-      SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
-                                      Cond, Op1, Op2);
+      SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
     }
 
@@ -17770,7 +17806,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   } else {
     SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
     SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
-    V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+    V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
     if (ExtVT == VT)
       return V;
   }
@@ -18572,7 +18608,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 
   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
 
   bool Is64Bit = Subtarget.is64Bit();
   MVT SPTy = getPointerTy(DAG.getDataLayout());
@@ -19021,8 +19057,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                     SDValue PreservedSrc,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-  if (isAllOnesConstant(Mask))
-    return Op;
+
+  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+    if (MaskConst->getZExtValue() & 0x1)
+      return Op;
 
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -19081,7 +19119,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
   // registration, or the .set_setframe offset.
   MCSymbol *OffsetSym =
       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
-          GlobalValue::getRealLinkageName(Fn->getName()));
+          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
   SDValue ParentFrameOffset =
       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -19683,12 +19721,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
                         DAG.getIntPtrConstant(0, dl));
       return DAG.getBitcast(Op.getValueType(), Res);
     }
-    case CONVERT_MASK_TO_VEC: {
-      SDValue Mask = Op.getOperand(1);
-      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
-      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
-      return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
-    }
     case BRCST_SUBVEC_TO_VEC: {
       SDValue Src = Op.getOperand(1);
       SDValue Passthru = Op.getOperand(2);
@@ -19932,7 +19964,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
     SDValue Op1 = Op.getOperand(1);
     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
-        GlobalValue::getRealLinkageName(Fn->getName()));
+        GlobalValue::dropLLVMManglingEscape(Fn->getName()));
 
     // Generate a simple absolute symbol reference. This intrinsic is only
     // supported on 32-bit Windows, which isn't PIC.
@@ -21741,6 +21773,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
     SDValue Ex = DAG.getBitcast(ExVT, R);
 
+    // ashr(R, 63) === cmp_slt(R, 0)
+    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+             "Unsupported PCMPGT op");
+      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
+                         getZeroVector(VT, Subtarget, DAG, dl), R);
+    }
+
     if (ShiftAmt >= 32) {
       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
       SDValue Upper =
@@ -21839,10 +21879,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   }
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
   if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
        (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
 
+    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
+    unsigned SubVectorScale = 1;
+    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      SubVectorScale =
+          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
+      Amt = Amt.getOperand(0);
+    }
+
     // Peek through any splat that was introduced for i64 shift vectorization.
     int SplatIndex = -1;
     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
@@ -21859,7 +21908,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       Amt = Amt.getOperand(0);
       unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
-                       VT.getVectorNumElements();
+                       (SubVectorScale * VT.getVectorNumElements());
       unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
       uint64_t ShiftAmt = 0;
       unsigned BaseOp = (SplatIndex < 0 ?
0 : SplatIndex * Ratio); @@ -22233,23 +22282,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); - return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; @@ -22371,15 +22418,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); - return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); - return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; @@ -23296,9 +23342,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) - : (Type*)VectorType::get(ArgTy, 4); + Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) + : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) @@ -25779,7 +25824,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = - BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0); + BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. @@ -26517,7 +26562,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); - + case TargetOpcode::PATCHABLE_EVENT_CALL: // Do nothing here, handle in xray instrumentation pass. 
     return BB;
@@ -29532,7 +29577,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
     SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                   DAG.getAllOnesConstant(DL, CondVT));
     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
-    return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
   }
 
   // To use the condition operand as a bitwise mask, it must have elements that
@@ -30015,7 +30060,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
         Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                             Cond.getOperand(0), Cond.getOperand(1), NewCC);
-        return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+        return DAG.getSelect(DL, VT, Cond, LHS, RHS);
       }
     }
   }
@@ -31561,20 +31606,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
 //   (sub (xor X, M), M)
 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
-  assert(N->getOpcode() == ISD::OR);
+  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
-  if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+        (VT.is256BitVector() && Subtarget.hasInt256())))
     return SDValue();
-  assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
 
-  // Canonicalize pandn to RHS
-  if (N0.getOpcode() == X86ISD::ANDNP)
+  // Canonicalize AND to LHS.
+  if (N1.getOpcode() == ISD::AND)
     std::swap(N0, N1);
 
+  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
+  // ANDNP combine allows other combines to happen that prevent matching.
   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
     return SDValue();
 
@@ -31596,20 +31643,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
   Y = peekThroughBitcasts(Y);
 
   EVT MaskVT = Mask.getValueType();
-
-  // Validate that the Mask operand is a vector sra node.
-  // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
-  // there is no psrai.b
   unsigned EltBits = MaskVT.getScalarSizeInBits();
-  unsigned SraAmt = ~0;
-  if (Mask.getOpcode() == ISD::SRA) {
-    if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
-      if (auto *AmtConst = AmtBV->getConstantSplatNode())
-        SraAmt = AmtConst->getZExtValue();
-  } else if (Mask.getOpcode() == X86ISD::VSRAI)
-    SraAmt = Mask.getConstantOperandVal(1);
-  if ((SraAmt + 1) != EltBits)
+  // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); @@ -31630,7 +31667,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && + DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); @@ -31642,9 +31680,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, V = Y; if (V) { - if (EltBits != 8 && EltBits != 16 && EltBits != 32) - return SDValue(); - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; @@ -31661,8 +31696,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, if (V == Y) std::swap(SubOp1, SubOp2); - return DAG.getBitcast(VT, - DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2)); + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); } } @@ -31675,7 +31710,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } @@ -33655,8 +33690,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. - auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); + return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); } /// Do target-specific dag combines on X86ISD::ANDNP nodes. @@ -33949,7 +33983,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); - return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + return DAG.getSelect(DL, VT, N0, AllOnes, Zero); } return SDValue(); } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 3dc673e3c35a..d003d027ddb9 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. 
 let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
+                           (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
                            "#ADJCALLSTACKDOWN",
                            []>,
                           Requires<[NotLP64]>;
@@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[NotLP64]>;
 }
-def : Pat<(X86callseq_start timm:$amt1),
-          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+          (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
 
 
 // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1),
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
+                           (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
                            "#ADJCALLSTACKDOWN",
                            []>,
                           Requires<[IsLP64]>;
@@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[IsLP64]>;
 }
-def : Pat<(X86callseq_start timm:$amt1),
-          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+          (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
 
 // x86-64 va_start lowering magic.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 888daa275265..092ceb207ada 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -5729,6 +5729,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
   }
 }
 
+std::pair<X86::CondCode, bool>
+X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
+  X86::CondCode CC = X86::COND_INVALID;
+  bool NeedSwap = false;
+  switch (Predicate) {
+  default: break;
+  // Floating-point Predicates
+  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+  // Integer Predicates
+  case CmpInst::ICMP_EQ:  CC = X86::COND_E; break;
+  case CmpInst::ICMP_NE:  CC = X86::COND_NE; break;
+  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+  }
+
+  return std::make_pair(CC, NeedSwap);
+}
+
 /// Return a set opcode for the given condition and
 /// whether it has memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
@@ -7589,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     return Expand2AddrUndef(MIB,
                             get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
   case X86::AVX2_SETALLONES:
     return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+  case X86::AVX1_SETALLONES: {
+    unsigned Reg = MIB->getOperand(0).getReg();
+    // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+    MIB->setDesc(get(X86::VCMPPSYrri));
+    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+    return true;
+  }
   case X86::AVX512_512_SETALLONES: {
     unsigned Reg = MIB->getOperand(0).getReg();
     MIB->setDesc(get(X86::VPTERNLOGDZrri));
@@ -8477,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       Alignment = 64;
       break;
     case X86::AVX2_SETALLONES:
+    case X86::AVX1_SETALLONES:
    case X86::AVX_SET0:
     case X86::AVX512_256_SET0:
       Alignment = 32;
@@ -8522,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   case X86::V_SET0:
   case X86::V_SETALLONES:
   case X86::AVX2_SETALLONES:
+  case X86::AVX1_SETALLONES:
   case X86::AVX_SET0:
   case X86::AVX512_128_SET0:
   case X86::AVX512_256_SET0:
@@ -8563,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
     else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
-             Opc == X86::AVX512_256_SET0)
+             Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
     else
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
     bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
-                      Opc == X86::AVX512_512_SETALLONES);
+                      Opc == X86::AVX512_512_SETALLONES ||
+                      Opc == X86::AVX1_SETALLONES);
     const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty)
                                   : Constant::getNullValue(Ty);
     unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 38567831b3a4..e64876073ccf 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -64,6 +64,10 @@ enum CondCode {
 // Turn condition code into conditional branch opcode.
 unsigned GetCondBranchFromCond(CondCode CC);
 
+/// \brief Return a pair of the condition code for the given predicate and
+/// whether the instruction operands should be swapped to match the condition
+/// code.
+std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
+
 /// \brief Return a set opcode for the given condition and whether it has
 /// a memory operand.
 unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
@@ -186,6 +190,8 @@ public:
   /// setup..destroy sequence (e.g. by pushes, or inside the callee).
   int64_t getFrameAdjustment(const MachineInstr &I) const {
     assert(isFrameInstr(I));
+    if (isFrameSetup(I))
+      return I.getOperand(2).getImm();
     return I.getOperand(1).getImm();
   }
 
@@ -193,7 +199,10 @@ public:
   /// instruction.
void setFrameAdjustment(MachineInstr &I, int64_t V) const { assert(isFrameInstr(I)); - I.getOperand(1).setImm(V); + if (isFrameSetup(I)) + I.getOperand(2).setImm(V); + else + I.getOperand(1).setImm(V); } /// getSPAdjust - This returns the stack pointer adjustment made by diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 902b0c2c04e3..4d7d8ece92d9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; -def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; @@ -2351,6 +2352,38 @@ let Predicates = [HasBMI2] in { def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + // x & (-1 >> (32 - y)) + def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x & (-1 >> (64 - y)) + def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + + // x << (32 - y) >> (32 - y) + def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x << (64 - y) >> (64 - y) + def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; } // HasBMI2 let Predicates = [HasBMI] in { diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 48da2fa607af..f73d85e7e01b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX1Only, OptForMinSize] in { + def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; + } let Predicates = [HasAVX2] in def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllOnesV))]>; @@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } - -// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit -// all ones value. 
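The new BZHI patterns above all compute "keep the low y bits of x". A scalar sanity check (illustrative, assuming 0 < n < 32 so every shift stays in range, matching the shift-amount forms the patterns match):

#include <cassert>
#include <cstdint>

// Reference: keep the low n bits of x (what BZHI computes).
uint32_t lowBits(uint32_t x, unsigned n) { return x & ((1u << n) - 1); }

int main() {
  for (uint32_t x : {0u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    for (unsigned n = 1; n < 32; ++n) {
      assert((x & (0xFFFFFFFFu >> (32 - n))) == lowBits(x, n)); // x & (-1 >> (32 - y))
      assert(((x << (32 - n)) >> (32 - n)) == lowBits(x, n));   // x << (32 - y) >> (32 - y)
    }
  }
}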
-let Predicates = [HasAVX1Only] in
-def : Pat<(v8i32 immAllOnesV),
-          (VINSERTF128rr
-          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm),
-          (V_SETALLONES), 1)>;
+// To create a 256-bit all ones value, we should produce VCMPTRUEPS
+// with YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
 
 multiclass vinsert_lowering {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index d65eb1de8d09..de58d719acb4 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -56,13 +56,9 @@ private:
   bool selectImpl(MachineInstr &I) const;
 
   // TODO: remove after supported by Tablegen-erated instruction selection.
-  unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const;
-  unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const;
   unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
                           uint64_t Alignment) const;
 
-  bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI,
-                      MachineFunction &MF) const;
   bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
                          MachineFunction &MF) const;
   bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -71,6 +67,10 @@ private:
                          MachineFunction &MF) const;
   bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
                    MachineFunction &MF) const;
+  bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
+                  MachineFunction &MF) const;
+  bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+                 MachineFunction &MF) const;
 
   const X86TargetMachine &TM;
   const X86Subtarget &STI;
@@ -226,13 +226,11 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
          "Generic instruction has unexpected implicit operands\n");
 
   if (selectImpl(I))
-      return true;
+    return true;
 
   DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
 
   // TODO: This should be implemented by tblgen.
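The VCMPTRUEPS idiom introduced above (both the AVX1_SETALLONES expansion and the HasAVX1Only pattern) can be written with intrinsics as follows — an illustrative sketch assuming an AVX build (-mavx), not part of the patch:

#include <immintrin.h>

// Predicate 0xf (_CMP_TRUE_UQ) is true for every lane, NaNs included,
// so comparing a register with itself -- even a zeroed "fake" input --
// yields all-ones without a constant-pool load.
__m256 allOnes256() {
  __m256 z = _mm256_setzero_ps();            // the AVX_SET0 fake input
  return _mm256_cmp_ps(z, z, _CMP_TRUE_UQ);  // VCMPPS ymm, ymm, ymm, 0xf
}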
- if (selectBinaryOp(I, MRI, MF)) - return true; if (selectLoadStoreOp(I, MRI, MF)) return true; if (selectFrameIndexOrGep(I, MRI, MF)) @@ -241,109 +239,14 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectTrunc(I, MRI, MF)) return true; + if (selectZext(I, MRI, MF)) + return true; + if (selectCmp(I, MRI, MF)) + return true; return false; } -unsigned X86InstructionSelector::getFAddOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FADD; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VADDSSZrr; - } else if (STI.hasAVX()) { - return X86::VADDSSrr; - } else if (STI.hasSSE1()) { - return X86::ADDSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VADDSDZrr; - } else if (STI.hasAVX()) { - return X86::VADDSDrr; - } else if (STI.hasSSE2()) { - return X86::ADDSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VADDPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VADDPSrr; - } else if (STI.hasSSE1()) { - return X86::ADDPSrr; - } - } - - return TargetOpcode::G_FADD; -} - -unsigned X86InstructionSelector::getFSubOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FSUB; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VSUBSSZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSSrr; - } else if (STI.hasSSE1()) { - return X86::SUBSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VSUBSDZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSDrr; - } else if (STI.hasSSE2()) { - return X86::SUBSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VSUBPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VSUBPSrr; - } else if (STI.hasSSE1()) { - return X86::SUBPSrr; - } - } - - return TargetOpcode::G_FSUB; -} - -bool X86InstructionSelector::selectBinaryOp(MachineInstr &I, - MachineRegisterInfo &MRI, - MachineFunction &MF) const { - - const unsigned DefReg = I.getOperand(0).getReg(); - LLT Ty = MRI.getType(DefReg); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - unsigned NewOpc = I.getOpcode(); - - switch (NewOpc) { - case TargetOpcode::G_FADD: - NewOpc = getFAddOp(Ty, RB); - break; - case TargetOpcode::G_FSUB: - NewOpc = getFSubOp(Ty, RB); - break; - default: - break; - } - - if (NewOpc == I.getOpcode()) - return false; - - I.setDesc(TII.get(NewOpc)); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); -} - unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const { @@ -562,6 +465,105 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, return true; } +bool X86InstructionSelector::selectZext(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ZEXT) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + + if (SrcTy == LLT::scalar(1)) { + + unsigned AndOpc; + if (DstTy == LLT::scalar(32)) + AndOpc = X86::AND32ri8; + else if (DstTy == LLT::scalar(64)) + AndOpc = X86::AND64ri8; + else + return false; + + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + unsigned DefReg = + 
MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank)); + + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG), DefReg) + .addImm(0) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + + MachineInstr &AndInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg) + .addReg(DefReg) + .addImm(1); + + constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; + } + + return false; +} + +bool X86InstructionSelector::selectCmp(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ICMP) + return false; + + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = X86::getX86ConditionCode( + (CmpInst::Predicate)I.getOperand(1).getPredicate()); + unsigned OpSet = X86::getSETFromCond(CC); + + unsigned LHS = I.getOperand(2).getReg(); + unsigned RHS = I.getOperand(3).getReg(); + + if (SwapArgs) + std::swap(LHS, RHS); + + unsigned OpCmp; + LLT Ty = MRI.getType(LHS); + + switch (Ty.getSizeInBits()) { + default: + return false; + case 8: + OpCmp = X86::CMP8rr; + break; + case 16: + OpCmp = X86::CMP16rr; + break; + case 32: + OpCmp = X86::CMP32rr; + break; + case 64: + OpCmp = X86::CMP64rr; + break; + } + + MachineInstr &CmpInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp)) + .addReg(LHS) + .addReg(RHS); + + MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(OpSet), I.getOperand(0).getReg()); + + constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); + constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; +} + InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 2a40399ba571..bc73bb1ae8c5 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, - FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, + FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, }; struct IntrinsicData { diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 4f5e70414aa9..cf26238c0239 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -87,10 +87,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_ZEXT, s32}, Legal); setAction({G_SEXT, s32}, Legal); - for (auto Ty : {s8, s16}) { + for (auto Ty : {s1, s8, s16}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfo64bit() { @@ -139,10 +145,16 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { setAction({G_SEXT, Ty}, Legal); } - for (auto Ty : {s8, s16, s32}) { + for (auto Ty : {s1, s8, s16, s32}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfoSSE1() { diff --git a/lib/Target/X86/X86RegisterInfo.cpp 
b/lib/Target/X86/X86RegisterInfo.cpp
index cf2ceef8013a..7e4cba1c8345 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -320,14 +320,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   case CallingConv::X86_RegCall:
     if (Is64Bit) {
       if (IsWin64) {
-        return (HasSSE ? CSR_Win64_RegCall_SaveList : 
+        return (HasSSE ? CSR_Win64_RegCall_SaveList :
                          CSR_Win64_RegCall_NoSSE_SaveList);
       } else {
-        return (HasSSE ? CSR_SysV64_RegCall_SaveList : 
+        return (HasSSE ? CSR_SysV64_RegCall_SaveList :
                          CSR_SysV64_RegCall_NoSSE_SaveList);
       }
     } else {
-      return (HasSSE ? CSR_32_RegCall_SaveList : 
+      return (HasSSE ? CSR_32_RegCall_SaveList :
                        CSR_32_RegCall_NoSSE_SaveList);
     }
   case CallingConv::Cold:
@@ -435,15 +435,15 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
     return CSR_64_HHVM_RegMask;
   case CallingConv::X86_RegCall:
     if (Is64Bit) {
-      if (IsWin64) { 
-        return (HasSSE ? CSR_Win64_RegCall_RegMask : 
+      if (IsWin64) {
+        return (HasSSE ? CSR_Win64_RegCall_RegMask :
                          CSR_Win64_RegCall_NoSSE_RegMask);
       } else {
-        return (HasSSE ? CSR_SysV64_RegCall_RegMask : 
+        return (HasSSE ? CSR_SysV64_RegCall_RegMask :
                          CSR_SysV64_RegCall_NoSSE_RegMask);
       }
     } else {
-      return (HasSSE ? CSR_32_RegCall_RegMask : 
+      return (HasSSE ? CSR_32_RegCall_RegMask :
                        CSR_32_RegCall_NoSSE_RegMask);
     }
   case CallingConv::Cold:
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..02be95e2e556 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
   /// True if the LEA instruction with certain arguments is slow
   bool SlowLEA;
 
+  /// True if the LEA instruction has all three source operands: base, index,
+  /// and offset, or if the LEA instruction uses base and index registers where
+  /// the base is EBP, RBP, or R13
+  bool Slow3OpsLEA;
+
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
@@ -490,6 +495,7 @@ public:
   bool callRegIndirect() const { return CallRegIndirect; }
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
+  bool slow3OpsLEA() const { return Slow3OpsLEA; }
   bool slowIncDec() const { return SlowIncDec; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 086f55dd60b5..c6a90725d89c 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
 namespace llvm {
 
 void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ExecutionDepsFixPass(PassRegistry &);
 
 } // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
   initializeWinEHStatePassPass(PR);
   initializeFixupBWInstPassPass(PR);
   initializeEvexToVexInstPassPass(PR);
+  initializeFixupLEAPassPass(PR);
   initializeX86ExecutionDepsFixPass(PR);
 }
 
@@ -87,7 +89,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   if (TT.isOSFreeBSD())
     return llvm::make_unique<X86FreeBSDTargetObjectFile>();
-  if (TT.isOSLinux() || TT.isOSNaCl())
+  if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
     return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
   if (TT.isOSFuchsia())
     return llvm::make_unique<X86FuchsiaTargetObjectFile>();
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f3b619a2956a..80e18161a94b 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++
b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -247,35 +247,38 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry SSE2UniformConstCostTable[] = { - { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. - { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - - { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand). - { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand). - { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb). - - { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. + { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. + { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. + + { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasSSE2()) { // pmuldq sequence. if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) - return LT.first * 30; + return LT.first * 32; if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; + // XOP has faster vXi8 shifts. + if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) || + !ST->hasXOP()) + if (const auto *Entry = + CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } static const CostTblEntry AVX2UniformCostTable[] = { @@ -430,18 +433,18 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 2 }, { ISD::SRA, MVT::v2i64, 2 }, // 256bit shifts require splitting if AVX2 didn't catch them above. - { ISD::SHL, MVT::v32i8, 2 }, - { ISD::SRL, MVT::v32i8, 4 }, - { ISD::SRA, MVT::v32i8, 4 }, - { ISD::SHL, MVT::v16i16, 2 }, - { ISD::SRL, MVT::v16i16, 4 }, - { ISD::SRA, MVT::v16i16, 4 }, - { ISD::SHL, MVT::v8i32, 2 }, - { ISD::SRL, MVT::v8i32, 4 }, - { ISD::SRA, MVT::v8i32, 4 }, - { ISD::SHL, MVT::v4i64, 2 }, - { ISD::SRL, MVT::v4i64, 4 }, - { ISD::SRA, MVT::v4i64, 4 }, + { ISD::SHL, MVT::v32i8, 2+2 }, + { ISD::SRL, MVT::v32i8, 4+2 }, + { ISD::SRA, MVT::v32i8, 4+2 }, + { ISD::SHL, MVT::v16i16, 2+2 }, + { ISD::SRL, MVT::v16i16, 4+2 }, + { ISD::SRA, MVT::v16i16, 4+2 }, + { ISD::SHL, MVT::v8i32, 2+2 }, + { ISD::SRL, MVT::v8i32, 4+2 }, + { ISD::SRA, MVT::v8i32, 4+2 }, + { ISD::SHL, MVT::v4i64, 2+2 }, + { ISD::SRL, MVT::v4i64, 4+2 }, + { ISD::SRA, MVT::v4i64, 4+2 }, }; // Look for XOP lowering tricks. 
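A note on the recurring "+2" terms in these cost tables (a reading of the table comments above, not new data): pre-AVX2 targets execute these 256-bit operations as two 128-bit halves, so each entry is twice the 128-bit cost plus roughly two extra instructions to split and rejoin the vector. For example, the v32i8 SHL entry of 4+2 means two psllw+pand pairs (cost 2 each) plus the split/merge overhead, so such a target is charged 6 rather than 4.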
@@ -451,23 +454,28 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2UniformShiftCostTable[] = { // Uniform splats are cheaper for the following instructions. - { ISD::SHL, MVT::v16i16, 2 }, // psllw. - { ISD::SHL, MVT::v8i32, 2 }, // pslld - { ISD::SHL, MVT::v4i64, 2 }, // psllq. - - { ISD::SRL, MVT::v16i16, 2 }, // psrlw. - { ISD::SRL, MVT::v8i32, 2 }, // psrld. - { ISD::SRL, MVT::v4i64, 2 }, // psrlq. - - { ISD::SRA, MVT::v16i16, 2 }, // psraw. - { ISD::SRA, MVT::v8i32, 2 }, // psrad. - { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. - { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. + { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. + { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. + { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. + + { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. + { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. + { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. + + { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. + { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. + { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. }; if (ST->hasSSE2() && ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || (Op2Info == TargetTransformInfo::OK_UniformValue))) { + + // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. + if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) + return LT.first * 4; // 2*psrad + shuffle. + if (const auto *Entry = CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; @@ -581,28 +589,28 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; static const CostTblEntry SSE41CostTable[] = { - { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. - { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. - { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld - { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld - - { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. - { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence. - { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend. - - { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. - { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence. - { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. - - { ISD::MUL, MVT::v4i32, 1 } // pmulld + { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. + { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld + { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split + + { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. + { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. 
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split. + + { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. + { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. + { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld }; if (ST->hasSSE41()) @@ -612,33 +620,33 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. - { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. - { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. - - { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v8i16, 1 }, // pmullw - { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle - { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add - - { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ + { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split. + + { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i16, 1 }, // pmullw + { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ // It is not a good idea to vectorize division. 
We have to scalarize it and // in the process we will often end up having to spilling regular diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 500b26b3be17..3ee14a0ff7b1 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { /*isVarArg=*/false); Function *Trampoline = Function::Create(TrampolineTy, GlobalValue::InternalLinkage, - Twine("__ehhandler$") + GlobalValue::getRealLinkageName( + Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape( ParentFunc->getName()), TheModule); BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index b8742683a0c8..1da189c5cd31 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -409,7 +409,7 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG) { KnownBits Known; DAG.computeKnownBits(Value, Known); - return Known.Zero.countTrailingOnes() >= 2; + return Known.countMinTrailingZeros() >= 2; } SDValue XCoreTargetLowering:: @@ -1131,8 +1131,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo( unsigned NumBytes = RetCCInfo.getNextStackOffset(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getConstant(NumBytes, dl, PtrVT, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SmallVector, 4> RegsToPass; SmallVector MemOpChains; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index f1d52d5a191f..b87ba6548962 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -73,9 +73,10 @@ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp, [SDNPHasChain, SDNPMayLoad]>; // These are target-independent nodes, but have target-specific formats. -def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>; + SDTCisVT<1, i32> ]>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; @@ -323,9 +324,9 @@ class F2R_np opc, string OpcStr> : //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP] in { -def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt), - "# ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt, i32imm:$amt2), + "# ADJCALLSTACKDOWN $amt, $amt2", + [(callseq_start timm:$amt, timm:$amt2)]>; def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2), "# ADJCALLSTACKUP $amt1", [(callseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt new file mode 100644 index 000000000000..ad458450fda3 --- /dev/null +++ b/lib/ToolDrivers/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(llvm-lib) diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt new file mode 100644 index 000000000000..7da9a5c01005 --- /dev/null +++ b/lib/ToolDrivers/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/ToolDrivers/LLVMBuild.txt --------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. 
See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = llvm-lib + +[component_0] +type = Group +name = ToolDrivers +parent = Libraries diff --git a/lib/ToolDrivers/llvm-lib/CMakeLists.txt b/lib/ToolDrivers/llvm-lib/CMakeLists.txt new file mode 100644 index 000000000000..ab53a6843446 --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(LibOptionsTableGen) + +add_llvm_library(LLVMLibDriver + LibDriver.cpp + ) +add_dependencies(LLVMLibDriver LibOptionsTableGen) diff --git a/lib/ToolDrivers/llvm-lib/LLVMBuild.txt b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt new file mode 100644 index 000000000000..799dc997c0bb --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/LibDriver/LLVMBuild.txt ----------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = LibDriver +parent = Libraries +required_libraries = Object Option Support diff --git a/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/lib/ToolDrivers/llvm-lib/LibDriver.cpp new file mode 100644 index 000000000000..3bae3826d62e --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -0,0 +1,171 @@ +//===- LibDriver.cpp - lib.exe-compatible driver --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a lib.exe-compatible driver that also understands +// bitcode files. Used by llvm-lib and lld-link /lib. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ToolDrivers/llvm-lib/LibDriver.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +enum { + OPT_INVALID = 0, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +static const llvm::opt::OptTable::Info infoTable[] = { +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \ + { \ + X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \ + OPT_##GROUP, OPT_##ALIAS, X6 \ + }, +#include "Options.inc" +#undef OPTION +}; + +class LibOptTable : public llvm::opt::OptTable { +public: + LibOptTable() : OptTable(infoTable, true) {} +}; + +} + +static std::string getOutputPath(llvm::opt::InputArgList *Args, + const llvm::NewArchiveMember &FirstMember) { + if (auto *Arg = Args->getLastArg(OPT_out)) + return Arg->getValue(); + SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier()); + llvm::sys::path::replace_extension(Val, ".lib"); + return Val.str(); +} + +static std::vector getSearchPaths(llvm::opt::InputArgList *Args, + StringSaver &Saver) { + std::vector Ret; + // Add current directory as first item of the search path. + Ret.push_back(""); + + // Add /libpath flags. + for (auto *Arg : Args->filtered(OPT_libpath)) + Ret.push_back(Arg->getValue()); + + // Add $LIB. + Optional EnvOpt = sys::Process::GetEnv("LIB"); + if (!EnvOpt.hasValue()) + return Ret; + StringRef Env = Saver.save(*EnvOpt); + while (!Env.empty()) { + StringRef Path; + std::tie(Path, Env) = Env.split(';'); + Ret.push_back(Path); + } + return Ret; +} + +static Optional findInputFile(StringRef File, + ArrayRef Paths) { + for (auto Dir : Paths) { + SmallString<128> Path = Dir; + sys::path::append(Path, File); + if (sys::fs::exists(Path)) + return Path.str().str(); + } + return Optional(); +} + +int llvm::libDriverMain(llvm::ArrayRef ArgsArr) { + SmallVector NewArgs(ArgsArr.begin(), ArgsArr.end()); + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs); + ArgsArr = NewArgs; + + LibOptTable Table; + unsigned MissingIndex; + unsigned MissingCount; + llvm::opt::InputArgList Args = + Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount); + if (MissingCount) { + llvm::errs() << "missing arg value for \"" + << Args.getArgString(MissingIndex) << "\", expected " + << MissingCount + << (MissingCount == 1 ? " argument.\n" : " arguments.\n"); + return 1; + } + for (auto *Arg : Args.filtered(OPT_UNKNOWN)) + llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n"; + + if (!Args.hasArgNoClaim(OPT_INPUT)) { + // No input files. To match lib.exe, silently do nothing. 
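An illustrative invocation (hypothetical paths, not from the patch) tying the helpers above together: running llvm-lib /libpath:build\obj /out:mylib.lib a.obj b.obj with LIB=C:\tools\lib set in the environment makes getSearchPaths return "" (the current directory), build\obj, and C:\tools\lib, in that order; findInputFile then resolves each input against those directories. Had /out been omitted, getOutputPath would derive the archive name from the first member, turning a.obj into a.lib.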
+ return 0; + } + + std::vector SearchPaths = getSearchPaths(&Args, Saver); + + std::vector Members; + for (auto *Arg : Args.filtered(OPT_INPUT)) { + Optional Path = findInputFile(Arg->getValue(), SearchPaths); + if (!Path.hasValue()) { + llvm::errs() << Arg->getValue() << ": no such file or directory\n"; + return 1; + } + Expected MOrErr = + NewArchiveMember::getFile(Saver.save(*Path), /*Deterministic=*/true); + if (!MOrErr) { + handleAllErrors(MOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) { + llvm::errs() << Arg->getValue() << ": " << EIB.message() << "\n"; + }); + return 1; + } + sys::fs::file_magic Magic = + sys::fs::identify_magic(MOrErr->Buf->getBuffer()); + if (Magic != sys::fs::file_magic::coff_object && + Magic != sys::fs::file_magic::bitcode && + Magic != sys::fs::file_magic::windows_resource) { + llvm::errs() << Arg->getValue() + << ": not a COFF object, bitcode or resource file\n"; + return 1; + } + Members.emplace_back(std::move(*MOrErr)); + } + + std::pair Result = + llvm::writeArchive(getOutputPath(&Args, Members[0]), Members, + /*WriteSymtab=*/true, object::Archive::K_GNU, + /*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin)); + + if (Result.second) { + if (Result.first.empty()) + Result.first = ArgsArr[0]; + llvm::errs() << Result.first << ": " << Result.second.message() << "\n"; + return 1; + } + + return 0; +} diff --git a/lib/ToolDrivers/llvm-lib/Options.td b/lib/ToolDrivers/llvm-lib/Options.td new file mode 100644 index 000000000000..5a56ef7468d4 --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/Options.td @@ -0,0 +1,25 @@ +include "llvm/Option/OptParser.td" + +// lib.exe accepts options starting with either a dash or a slash. + +// Flag that takes no arguments. +class F : Flag<["/", "-", "-?"], name>; + +// Flag that takes one argument after ":". +class P : + Joined<["/", "-", "-?"], name#":">, HelpText; + +def libpath: P<"libpath", "Object file search path">; +def out : P<"out", "Path to file to write output">; + +def llvmlibthin : F<"llvmlibthin">; + +//============================================================================== +// The flags below do nothing. They are defined only for lib.exe compatibility. +//============================================================================== + +class QF : Joined<["/", "-", "-?"], name#":">; + +def ignore : QF<"ignore">; +def machine: QF<"machine">; +def nologo : F<"nologo">; diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 19e6789dfa74..4480220f2cd4 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) // consume. Note, that crossing coro.save also requires a spill, as any code // between coro.save and coro.suspend may resume the coroutine and all of the // state needs to be saved by that time. - auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) { + auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) { BasicBlock *SuspendBlock = BarrierInst->getParent(); auto &B = getBlockData(SuspendBlock); B.Suspend = true; @@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { return FramePtr; } +// Sets the unwind edge of an instruction to a particular successor. 
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) { + if (auto *II = dyn_cast(TI)) + II->setUnwindDest(Succ); + else if (auto *CS = dyn_cast(TI)) + CS->setUnwindDest(Succ); + else if (auto *CR = dyn_cast(TI)) + CR->setUnwindDest(Succ); + else + llvm_unreachable("unexpected terminator instruction"); +} + +// Replaces all uses of OldPred with the NewPred block in all PHINodes in a +// block. +static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred, + BasicBlock *NewPred, + PHINode *LandingPadReplacement) { + unsigned BBIdx = 0; + for (BasicBlock::iterator I = DestBB->begin(); isa(I); ++I) { + PHINode *PN = cast(I); + + // We manually update the LandingPadReplacement PHINode and it is the last + // PHI Node. So, if we find it, we are done. + if (LandingPadReplacement == PN) + break; + + // Reuse the previous value of BBIdx if it lines up. In cases where we + // have multiple phi nodes with *lots* of predecessors, this is a speed + // win because we don't have to scan the PHI looking for TIBB. This + // happens because the BB list of PHI nodes are usually in the same + // order. + if (PN->getIncomingBlock(BBIdx) != OldPred) + BBIdx = PN->getBasicBlockIndex(OldPred); + + assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!"); + PN->setIncomingBlock(BBIdx, NewPred); + } +} + +// Uses SplitEdge unless the successor block is an EHPad, in which case do EH +// specific handling. +static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ, + LandingPadInst *OriginalPad, + PHINode *LandingPadReplacement) { + auto *PadInst = Succ->getFirstNonPHI(); + if (!LandingPadReplacement && !PadInst->isEHPad()) + return SplitEdge(BB, Succ); + + auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ); + setUnwindEdgeTo(BB->getTerminator(), NewBB); + updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement); + + if (LandingPadReplacement) { + auto *NewLP = OriginalPad->clone(); + auto *Terminator = BranchInst::Create(Succ, NewBB); + NewLP->insertBefore(Terminator); + LandingPadReplacement->addIncoming(NewLP, NewBB); + return NewBB; + } + Value *ParentPad = nullptr; + if (auto *FuncletPad = dyn_cast(PadInst)) + ParentPad = FuncletPad->getParentPad(); + else if (auto *CatchSwitch = dyn_cast(PadInst)) + ParentPad = CatchSwitch->getParentPad(); + else + llvm_unreachable("handling for other EHPads not implemented yet"); + + auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB); + CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB); + return NewBB; +} + static void rewritePHIs(BasicBlock &BB) { // For every incoming edge we will create a block holding all // incoming values in a single PHI nodes. @@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) { // loop: // %n.val = phi i32[%n, %entry], [%inc, %loop] // - // It will create: + // It will create: // // loop.from.entry: // %n.loop.pre = phi i32 [%n, %entry] @@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) { // TODO: Simplify PHINodes in the basic block to remove duplicate // predecessors. + LandingPadInst *LandingPad = nullptr; + PHINode *ReplPHI = nullptr; + if ((LandingPad = dyn_cast_or_null(BB.getFirstNonPHI()))) { + // ehAwareSplitEdge will clone the LandingPad in all the edge blocks. + // We replace the original landing pad with a PHINode that will collect the + // results from all of them. 
+    ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+    ReplPHI->takeName(LandingPad);
+    LandingPad->replaceAllUsesWith(ReplPHI);
+    // We will erase the original landing pad at the end of this function
+    // after ehAwareSplitEdge cloned it in the transition blocks.
+  }
+
   SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
   for (BasicBlock *Pred : Preds) {
-    auto *IncomingBB = SplitEdge(Pred, &BB);
+    auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
     IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
     auto *PN = cast<PHINode>(&BB.front());
     do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
       InputV->addIncoming(V, Pred);
       PN->setIncomingValue(Index, InputV);
       PN = dyn_cast<PHINode>(PN->getNextNode());
-    } while (PN);
+    } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+                             // the landing pad.
+  }
+
+  if (LandingPad) {
+    // Calls to ehAwareSplitEdge cloned the original landing pad; it is no
+    // longer needed.
+    LandingPad->eraseFromParent();
   }
 }
 
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 7ed07d63c627..231487923fad 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
       return true;
 
     // Lookup the linkage recorded in the summaries during global analysis.
-    const auto &GS = DefinedGlobals.find(GV.getGUID());
-    GlobalValue::LinkageTypes Linkage;
+    auto GS = DefinedGlobals.find(GV.getGUID());
     if (GS == DefinedGlobals.end()) {
       // Must have been promoted (possibly conservatively). Find original
       // name so that we can access the correct summary and see if it can
@@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
       std::string OrigId = GlobalValue::getGlobalIdentifier(
           OrigName, GlobalValue::InternalLinkage,
           TheModule.getSourceFileName());
-      const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+      GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
       if (GS == DefinedGlobals.end()) {
         // Also check the original non-promoted non-globalized name. In some
         // cases a preempted weak value is linked in as a local copy because
         // In that case, since it was originally not a local value, it was
         // recorded in the index using the original name.
         // FIXME: This may not be needed once PR27866 is fixed.
-        const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+        GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
         assert(GS != DefinedGlobals.end());
-        Linkage = GS->second->linkage();
-      } else {
-        Linkage = GS->second->linkage();
       }
-    } else
-      Linkage = GS->second->linkage();
-    return !GlobalValue::isLocalLinkage(Linkage);
+    }
+    return !GlobalValue::isLocalLinkage(GS->second->linkage());
   };
 
   // FIXME: See if we can just internalize directly here via linkage changes
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 6c83c99ae3be..673d3af0ab52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
     std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
 
   InlinedArrayAllocasTy InlinedArrayAllocas;
-  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
 
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
@@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
       // Setup the data structure used to plumb customization into the
       // `InlineFunction` routine.
       InlineFunctionInfo IFI(
-          /*cg=*/nullptr, &GetAssumptionCache,
+          /*cg=*/nullptr, &GetAssumptionCache, PSI,
           &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
           &FAM.getResult<BlockFrequencyAnalysis>(Callee));
 
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 2db47b3b5622..8dff2fb3be8a 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
 static cl::opt<bool> DisablePartialInlining("disable-partial-inlining",
                                             cl::init(false), cl::Hidden,
                                             cl::desc("Disable partial inlining"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+                                      cl::init(false), cl::ZeroOrMore,
+                                      cl::ReallyHidden,
+                                      cl::desc("Skip Cost Analysis"));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(
     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
     cl::desc("Max number of partial inlining. The default is unlimited"));
 
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces a larger value, the BFI value will be used.
+static cl::opt<unsigned>
+    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+                             cl::Hidden, cl::ZeroOrMore,
+                             cl::desc("Relative frequency of outline region to "
+                                      "the entry block"));
+
 namespace {
 
 struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
-  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
   ProfileSummaryInfo *PSI;
 
-  bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+  // Return the frequency of the OutliningBB relative to F's entry point.
+  // The result is no larger than 1 and is represented using BP.
+  // (Note that the outlined region's 'head' block can only have incoming
+  // edges from the guarding entry blocks).
+  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+                                                   FunctionOutliningInfo *OI,
+                                                   Function *DuplicateFunction,
+                                                   BlockFrequencyInfo *BFI,
+                                                   BasicBlock *OutliningCallBB);
+
+  // Return true if the callee of CS should be partially inlined with
+  // profit.
+  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+                           BlockFrequencyInfo *CalleeBFI,
+                           BasicBlock *OutliningCallBB,
+                           int OutliningCallOverhead,
+                           OptimizationRemarkEmitter &ORE);
+
+  // Try to inline DuplicateFunction (cloned from F, with a call to
+  // the OutlinedFunction) into its callers. Return true
+  // if there is any successful inlining.
+  bool tryPartialInline(Function *DuplicateFunction,
+                        Function *F, /*original function*/
+                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
+                        BlockFrequencyInfo *CalleeBFI);
+
+  // Compute the mapping from use site of DuplicateFunction to the enclosing
+  // BB's profile count.
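As far as the surrounding hunks show, the map declared just above exists so that after a callee is partially inlined into some of its call sites, the profile counts of the blocks enclosing the remaining call sites are known and the callee's entry count can be adjusted accordingly: each surviving use of the duplicate function is keyed to the profile count of the block containing it.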
+  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+                                     DenseMap<User *, uint64_t> &SiteCountMap);
+
   bool IsLimitReached() {
     return (MaxNumPartialInlining != -1 &&
             NumPartialInlining >= MaxNumPartialInlining);
   }
+
+  CallSite getCallSite(User *U) {
+    CallSite CS;
+    if (CallInst *CI = dyn_cast<CallInst>(U))
+      CS = CallSite(CI);
+    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+      CS = CallSite(II);
+    else
+      llvm_unreachable("All uses must be calls");
+    return CS;
+  }
+
+  CallSite getOneCallSiteTo(Function *F) {
+    User *User = *F->user_begin();
+    return getCallSite(User);
+  }
+
+  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+    CallSite CS = getOneCallSiteTo(F);
+    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+    BasicBlock *Block = CS.getParent();
+    return std::make_tuple(DLoc, Block);
+  }
+
+  // Returns the costs associated with function outlining:
+  // - The first value is the non-weighted runtime cost for making the call
+  //   to the outlined function 'OutlinedFunction', including the additional
+  //   setup cost in the outlined function itself;
+  // - The second value is the estimated size of the new call sequence in
+  //   basic block 'OutliningCallBB';
+  // - The third value is the estimated size of the original code from
+  //   function 'F' that is extracted into the outlined function.
+  std::tuple<int, int, int>
+  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+                        Function *OutlinedFunction,
+                        BasicBlock *OutliningCallBB);
+  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+  // approximate both the size and runtime cost (Note that in the current
+  // inline cost analysis, there is no clear distinction there either).
+  int computeBBInlineCost(BasicBlock *BB);
+
+  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
@@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
     return isa<ReturnInst>(TI);
   };
 
-  auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
     if (IsReturnBlock(Succ1))
       return std::make_tuple(Succ1, Succ2);
     if (IsReturnBlock(Succ2))
@@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   };
 
   // Detect a triangular shape:
-  auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
     if (IsSuccessor(Succ1, Succ2))
       return std::make_tuple(Succ1, Succ2);
     if (IsSuccessor(Succ2, Succ1))
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   // Do sanity check of the entries: there should not
   // be any successors (not in the entry set) other than
   // {ReturnBlock, NonReturnBlock}
-  assert(OutliningInfo->Entries[0] == &F->front());
+  assert(OutliningInfo->Entries[0] == &F->front() &&
+         "Function Entry must be the first in Entries vector");
   DenseSet<BasicBlock *> Entries;
   for (BasicBlock *E : OutliningInfo->Entries)
     Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   return OutliningInfo;
 }
 
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
-                                             OptimizationRemarkEmitter &ORE) {
-  // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+  if (F->getEntryCount())
+    return true;
+  // Now check if any of the entry blocks has MD_prof data:
+  for (auto *E : OI->Entries) {
+    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+    if (!BR || BR->isUnconditional())
+      continue;
+
uint64_t T, F; + if (BR->extractProfMetadata(T, F)) + return true; + } + return false; +} + +BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( + Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction, + BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) { + + auto EntryFreq = + BFI->getBlockFreq(&DuplicateFunction->getEntryBlock()); + auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB); + + auto OutlineRegionRelFreq = + BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), + EntryFreq.getFrequency()); + + if (hasProfileData(F, OI)) + return OutlineRegionRelFreq; + + // When profile data is not available, we need to be very + // conservative in estimating the overall savings. We need to make sure + // the outline region relative frequency is not below the threshold + // specified by the option. + OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); + + return OutlineRegionRelFreq; +} + +bool PartialInlinerImpl::shouldPartialInline( + CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI, + BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB, + int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) { using namespace ore; + if (SkipCostAnalysis) + return true; + Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); @@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS, if (IC.isAlways()) { ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) - << NV("Callee", Callee) + << NV("Callee", F) << " should always be fully inlined, not partially"); return false; } if (IC.isNever()) { ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) - << NV("Callee", Callee) << " not partially inlined into " + << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because it should never be inlined (cost=never)"); return false; } if (!IC) { - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) - << NV("Callee", Callee) << " not partially inlined into " + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) + << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return false; } + const DataLayout &DL = Caller->getParent()->getDataLayout(); + // The savings of eliminating the call: + int NonWeightedSavings = getCallsiteCost(CS, DL); + BlockFrequency NormWeightedSavings(NonWeightedSavings); + + auto RelativeFreq = + getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB); + auto NormWeightedRcost = + BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq; + + // Weighted saving is smaller than weighted cost, return false + if (NormWeightedSavings < NormWeightedRcost) { + ORE.emit( + OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) + << NV("Callee", F) << " not partially inlined into " + << NV("Caller", Caller) << " runtime overhead (overhead=" + << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency()) + << ", savings=" + << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" + << " of making the outlined call is too high"); + + return false; + } ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) - << NV("Callee", Callee) << " can be 
+bool PartialInlinerImpl::shouldPartialInline(
+    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
   using namespace ore;
+  if (SkipCostAnalysis)
+    return true;
+
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
@@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,

   if (IC.isAlways()) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
-             << NV("Callee", Callee)
+             << NV("Callee", F)
              << " should always be fully inlined, not partially");
     return false;
   }

   if (IC.isNever()) {
     ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+             << NV("Callee", F) << " not partially inlined into "
              << NV("Caller", Caller)
              << " because it should never be inlined (cost=never)");
     return false;
   }

   if (!IC) {
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+             << NV("Callee", F) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
              << NV("Cost", IC.getCost()) << ", threshold="
              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
     return false;
   }
+  const DataLayout &DL = Caller->getParent()->getDataLayout();
+  // The savings of eliminating the call:
+  int NonWeightedSavings = getCallsiteCost(CS, DL);
+  BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+  auto RelativeFreq =
+      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+  auto NormWeightedRcost =
+      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+  // If the weighted savings are smaller than the weighted cost, return false:
+  if (NormWeightedSavings < NormWeightedRcost) {
+    ORE.emit(
+        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+        << NV("Callee", F) << " not partially inlined into "
+        << NV("Caller", Caller) << " because the runtime overhead (overhead="
+        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+        << ", savings="
+        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+        << " of making the outlined call is too high");
+
+    return false;
+  }

   ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
-           << NV("Callee", Callee) << " can be partially inlined into "
+           << NV("Callee", F) << " can be partially inlined into "
            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
            << " (threshold="
            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
   return true;
 }

+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as the runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+  int InlineCost = 0;
+  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(CI), DL);
+      continue;
+    }
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(II), DL);
+      continue;
+    }
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+      continue;
+    }
+    InlineCost += InlineConstants::InstrCost;
+  }
+  return InlineCost;
+}
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+    BasicBlock *OutliningCallBB) {
+  // First compute the cost of the outlined region 'OI' in the original
+  // function 'F':
+  int OutlinedRegionCost = 0;
+  for (BasicBlock &BB : *F) {
+    if (&BB != OI->ReturnBlock &&
+        // Assuming Entry set is small -- do a linear search here:
+        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+            OI->Entries.end()) {
+      OutlinedRegionCost += computeBBInlineCost(&BB);
+    }
+  }
+
+  // Now compute the cost of the call sequence to the outlined function
+  // 'OutlinedFunction' in BB 'OutliningCallBB':
+  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+  // Now compute the cost of the extracted/outlined function itself:
+  int OutlinedFunctionCost = 0;
+  for (BasicBlock &BB : *OutlinedFunction) {
+    OutlinedFunctionCost += computeBBInlineCost(&BB);
+  }
+
+  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+         "Outlined function cost should be no less than the outlined region");
+  int OutliningRuntimeOverhead =
+      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+                         OutlinedRegionCost);
+}
+
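As a worked instance of the tuple returned above (numbers invented for illustration): if the extracted region cost 40 in 'F', the outlined function costs 55 (the region plus 15 of argument and return plumbing added by extraction), and the new call sequence costs 10, then the runtime overhead is 10 + (55 - 40) = 25, weighed later against a size saving of 40 versus 10. The formula in isolation:

    // Sketch of the overhead computed by computeOutliningCosts(): the new
    // call sequence plus whatever the code extractor added on top of the
    // original region.
    int outliningRuntimeOverhead(int callSeqCost, int outlinedFnCost,
                                 int regionCost) {
      return callSeqCost + (outlinedFnCost - regionCost);
    }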
+// Create the callsite to profile count map, which is used to update the
+// original function's entry count after the function is partially inlined
+// into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+    Function *DuplicateFunction,
+    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+  std::vector<User *> Users(DuplicateFunction->user_begin(),
+                            DuplicateFunction->user_end());
+  Function *CurrentCaller = nullptr;
+  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+  auto ComputeCurrBFI = [&,this](Function *Caller) {
+    // For the old pass manager:
+    if (!GetBFI) {
+      if (CurrentCallerBFI)
+        delete CurrentCallerBFI;
+      DominatorTree DT(*Caller);
+      LoopInfo LI(DT);
+      BranchProbabilityInfo BPI(*Caller, LI);
+      CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+    } else {
+      // New pass manager:
+      CurrentCallerBFI = &(*GetBFI)(*Caller);
+    }
+  };
+
+  for (User *User : Users) {
+    CallSite CS = getCallSite(User);
+    Function *Caller = CS.getCaller();
+    if (CurrentCaller != Caller) {
+      CurrentCaller = Caller;
+      ComputeCurrBFI(Caller);
+    } else {
+      assert(CurrentCallerBFI && "CallerBFI is not set");
+    }
+    BasicBlock *CallBB = CS.getInstruction()->getParent();
+    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+    if (Count)
+      CallSiteToProfCountMap[User] = *Count;
+    else
+      CallSiteToProfCountMap[User] = 0;
+  }
+  if (!GetBFI) {
+    if (CurrentCallerBFI)
+      delete CurrentCallerBFI;
+  }
+}
+
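The ComputeCurrBFI lambda above highlights a pass-manager split: under the legacy pass manager the function builds a fresh BlockFrequencyInfo per caller and owns its lifetime, while under the new pass manager it borrows a cached analysis result. The per-call-site lookup itself reduces to this (a sketch using the LLVM API named in the patch; treat the helper name as mine):

    #include "llvm/Analysis/BlockFrequencyInfo.h"
    #include "llvm/IR/Instruction.h"

    // Sketch: the count cached for each call site is just the profile count
    // of the block containing the call, or 0 when unavailable.
    uint64_t callSiteProfCount(llvm::BlockFrequencyInfo &BFI,
                               llvm::Instruction *Call) {
      auto Count = BFI.getBlockProfileCount(Call->getParent());
      return Count ? *Count : 0;
    }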
 Function *PartialInlinerImpl::unswitchFunction(Function *F) {

   if (F->hasAddressTaken())
@@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   if (PSI->isFunctionEntryCold(F))
     return nullptr;

-  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
-      computeOutliningInfo(F);
+  if (F->user_begin() == F->user_end())
+    return nullptr;
+
+  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

-  if (!OutliningInfo)
+  if (!OI)
     return nullptr;

   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
   Function *DuplicateFunction = CloneFunction(F, VMap);
-  BasicBlock *NewReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
-  BasicBlock *NewNonReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
   DenseSet<BasicBlock *> NewEntries;
-  for (BasicBlock *BB : OutliningInfo->Entries) {
+  for (BasicBlock *BB : OI->Entries) {
     NewEntries.insert(cast<BasicBlock>(VMap[BB]));
   }

@@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   BasicBlock *PreReturn = NewReturnBlock;
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
-  unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();

   if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
     NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
       Ins = NewReturnBlock->getFirstNonPHI();
       RetPhi->addIncoming(&*I, PreReturn);

-      for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+      for (BasicBlock *E : OI->ReturnBlockPreds) {
         BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
         RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
         OldPhi->removeIncomingValue(NewE);
       }
       ++I;
     }
-    for (auto E : OutliningInfo->ReturnBlockPreds) {
+    for (auto E : OI->ReturnBlockPreds) {
       BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
       NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
     }
@@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {

   // Returns true if the block is to be partially inlined into the caller
   // (i.e. not to be extracted to the out-of-line function)
-  auto ToBeInlined = [=](BasicBlock *BB) {
+  auto ToBeInlined = [&](BasicBlock *BB) {
     return BB == NewReturnBlock || NewEntries.count(BB);
   };
   // Gather up the blocks that we're going to extract.
@@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);

   // Extract the body of the if.
-  Function *ExtractedFunction =
+  Function *OutlinedFunction =
       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
           .extractCodeRegion();

-  // Inline the top-level if test into all callers.
+  bool AnyInline =
+      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+  // Ditch the duplicate, since we're done with it, and rewrite all remaining
+  // users (function pointers, etc.) back to the original function.
+  DuplicateFunction->replaceAllUsesWith(F);
+  DuplicateFunction->eraseFromParent();
+
+  if (AnyInline)
+    return OutlinedFunction;
+
+  // Remove the function that was speculatively created:
+  if (OutlinedFunction)
+    OutlinedFunction->eraseFromParent();
+
+  return nullptr;
+}
+
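tryPartialInline below consumes the call-site count map: each time a call site is inlined, its profile count is subtracted from the callee's remaining entry count, saturating at zero. In isolation the bookkeeping is (my own sketch, not patch code):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch: remaining entry count of the original callee after partially
    // inlining into call sites with the given profile counts.
    uint64_t remainingEntryCount(uint64_t calleeCount,
                                 const std::vector<uint64_t> &inlinedSites) {
      for (uint64_t SiteCount : inlinedSites)
        calleeCount -= std::min(calleeCount, SiteCount);
      return calleeCount;
    }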
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+                                          Function *F,
+                                          FunctionOutliningInfo *OI,
+                                          Function *OutlinedFunction,
+                                          BlockFrequencyInfo *CalleeBFI) {
+  if (OutlinedFunction == nullptr)
+    return false;
+
+  int NonWeightedRcost;
+  int SizeCost;
+  int OutlinedRegionSizeCost;
+
+  auto OutliningCallBB =
+      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+  // If the call sequence to the outlined function is larger than the
+  // original outlined region, outlining does not increase the chances of
+  // inlining 'F' (the inliner uses the size increase to model the cost of
+  // inlining a callee).
+  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+    OptimizationRemarkEmitter ORE(F);
+    DebugLoc DLoc;
+    BasicBlock *Block;
+    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+                                        DLoc, Block)
+             << ore::NV("Function", F)
+             << " not partially inlined into callers (Original Size = "
+             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+             << ", Size of call sequence to outlined function = "
+             << ore::NV("NewSize", SizeCost) << ")");
+    return false;
+  }
+
+  assert(F->user_begin() == F->user_end() &&
+         "F's users should all be replaced!");
   std::vector<User *> Users(DuplicateFunction->user_begin(),
                             DuplicateFunction->user_end());

+  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+  if (F->getEntryCount())
+    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+  auto CalleeEntryCount = F->getEntryCount();
+  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+  bool AnyInline = false;
   for (User *User : Users) {
-    CallSite CS;
-    if (CallInst *CI = dyn_cast<CallInst>(User))
-      CS = CallSite(CI);
-    else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
-      CS = CallSite(II);
-    else
-      llvm_unreachable("All uses must be calls");
+    CallSite CS = getCallSite(User);

     if (IsLimitReached())
       continue;

     OptimizationRemarkEmitter ORE(CS.getCaller());
-    if (!shouldPartialInline(CS, ORE))
+
+    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+                             NonWeightedRcost, ORE))
       continue;

-    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
-    BasicBlock *Block = CS.getParent();
-    ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
-             << ore::NV("Callee", F) << " partially inlined into "
-             << ore::NV("Caller", CS.getCaller()));
+    ORE.emit(
+        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+        << ore::NV("Callee", F) << " partially inlined into "
+        << ore::NV("Caller", CS.getCaller()));

-    InlineFunctionInfo IFI(nullptr, GetAssumptionCache);
+    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
     InlineFunction(CS, IFI);
+
+    // Now update the entry count:
+    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+    }
+
+    AnyInline = true;
     NumPartialInlining++;
-    // update stats
+    // Update the stats:
     NumPartialInlined++;
   }

-  // Ditch the duplicate, since we're done with it, and rewrite all remaining
-  // users (function pointers, etc.) back to the original function.
-  DuplicateFunction->replaceAllUsesWith(F);
-  DuplicateFunction->eraseFromParent();
-
+  if (AnyInline && CalleeEntryCount)
+    F->setEntryCount(CalleeEntryCountV);

-  return ExtractedFunction;
+  return AnyInline;
 }

 bool PartialInlinerImpl::run(Module &M) {
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index d3a3c24ce7b4..659cb9df00a2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Constants.h"
@@ -178,7 +179,7 @@ void filterModule(
   else
     GO = new GlobalVariable(
         *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
-        (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+        nullptr, "", nullptr,
         GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
   GO->takeName(GA);
   GA->replaceAllUsesWith(GO);
@@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(

   // FIXME: Try to re-use BSI and PFI from the original module here.
-  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+  ProfileSummaryInfo PSI(M);
+  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);

   SmallVector<char, 0> Buffer;

diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 153a186d5ed4..0ca62b7ae40c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
   return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
 }

-/// \brief Return true if we can prove that adding the two values of the
-/// knownbits will not overflow.
-/// Otherwise return false.
-static bool checkRippleForAdd(const KnownBits &LHSKnown, - const KnownBits &RHSKnown) { - // Addition of two 2's complement numbers having opposite signs will never - // overflow. - if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) || - (LHSKnown.isNonNegative() && RHSKnown.isNegative())) - return true; - - // If either of the values is known to be non-negative, adding them can only - // overflow if the second is also non-negative, so we can assume that. - // Two non-negative numbers will only overflow if there is a carry to the - // sign bit, so we can check if even when the values are as big as possible - // there is no overflow to the sign bit. - if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) { - APInt MaxLHS = ~LHSKnown.Zero; - MaxLHS.clearSignBit(); - APInt MaxRHS = ~RHSKnown.Zero; - MaxRHS.clearSignBit(); - APInt Result = std::move(MaxLHS) + std::move(MaxRHS); - return Result.isSignBitClear(); - } - - // If either of the values is known to be negative, adding them can only - // overflow if the second is also negative, so we can assume that. - // Two negative number will only overflow if there is no carry to the sign - // bit, so we can check if even when the values are as small as possible - // there is overflow to the sign bit. - if (LHSKnown.isNegative() || RHSKnown.isNegative()) { - APInt MinLHS = LHSKnown.One; - MinLHS.clearSignBit(); - APInt MinRHS = RHSKnown.One; - MinRHS.clearSignBit(); - APInt Result = std::move(MinLHS) + std::move(MinRHS); - return Result.isSignBitSet(); - } - - // If we reached here it means that we know nothing about the sign bits. - // In this case we can't know if there will be an overflow, since by - // changing the sign bits any two values can be made to overflow. - return false; -} - -/// Return true if we can prove that: -/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) -/// This basically requires proving that the add in the original type would not -/// overflow to change the sign bit or have a carry out. -bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, - Instruction &CxtI) { - // There are different heuristics we can use for this. Here are some simple - // ones. - - // If LHS and RHS each have at least two sign bits, the addition will look - // like - // - // XX..... + - // YY..... - // - // If the carry into the most significant position is 0, X and Y can't both - // be 1 and therefore the carry out of the addition is also 0. - // - // If the carry into the most significant position is 1, X and Y can't both - // be 0 and therefore the carry out of the addition is also 1. - // - // Since the carry into the most significant position is always equal to - // the carry out of the addition, there is no signed overflow. - if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 && - ComputeNumSignBits(RHS, 0, &CxtI) > 1) - return true; - - unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - KnownBits LHSKnown(BitWidth); - computeKnownBits(LHS, LHSKnown, 0, &CxtI); - - KnownBits RHSKnown(BitWidth); - computeKnownBits(RHS, RHSKnown, 0, &CxtI); - - // Check if carry bit of addition will not cause overflow. 
- if (checkRippleForAdd(LHSKnown, RHSKnown)) - return true; - - return false; -} - /// \brief Return true if we can prove that: /// (sub LHS, RHS) === (sub nsw LHS, RHS) /// This basically requires proving that the add in the original type would not @@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS, bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI) { // If the LHS is negative and the RHS is non-negative, no unsigned wrap. - bool LHSKnownNonNegative, LHSKnownNegative; - bool RHSKnownNonNegative, RHSKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, - &CxtI); - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, - &CxtI); - if (LHSKnownNegative && RHSKnownNonNegative) + KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI); + if (LHSKnown.isNegative() && RHSKnown.isNonNegative()) return true; return false; @@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I, return nullptr; } +static Instruction *foldAddWithConstant(BinaryOperator &Add, + InstCombiner::BuilderTy &Builder) { + Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1); + const APInt *C; + if (!match(Op1, m_APInt(C))) + return nullptr; + + if (C->isSignMask()) { + // If wrapping is not allowed, then the addition must set the sign bit: + // X + (signmask) --> X | signmask + if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap()) + return BinaryOperator::CreateOr(Op0, Op1); + + // If wrapping is allowed, then the addition flips the sign bit of LHS: + // X + (signmask) --> X ^ signmask + return BinaryOperator::CreateXor(Op0, Op1); + } + + Value *X; + const APInt *C2; + Type *Ty = Add.getType(); + + // Is this add the last step in a convoluted sext? + // add(zext(xor i16 X, -32768), -32768) --> sext X + if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) && + C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C) + return CastInst::Create(Instruction::SExt, X, Ty); + + // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C)) + // FIXME: This should check hasOneUse to not increase the instruction count? 
+ if (C->isNegative() && + match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) && + C->sge(-C2->sext(C->getBitWidth()))) { + Constant *NewC = + ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth())); + return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty); + } + + // Shifts and add used to flip and mask off the low bit: + // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 + const APInt *C3; + if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)), + m_APInt(C3)))) && + C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { + Value *NotX = Builder.CreateNot(X); + return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); + } + + return nullptr; +} + Instruction *InstCombiner::visitAdd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); @@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Value *V = SimplifyUsingDistributiveLaws(I)) return replaceInstUsesWith(I, V); - const APInt *RHSC; - if (match(RHS, m_APInt(RHSC))) { - if (RHSC->isSignMask()) { - // If wrapping is not allowed, then the addition must set the sign bit: - // X + (signmask) --> X | signmask - if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap()) - return BinaryOperator::CreateOr(LHS, RHS); - - // If wrapping is allowed, then the addition flips the sign bit of LHS: - // X + (signmask) --> X ^ signmask - return BinaryOperator::CreateXor(LHS, RHS); - } - - // Is this add the last step in a convoluted sext? - Value *X; - const APInt *C; - if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) && - C->isMinSignedValue() && - C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) { - // add(zext(xor i16 X, -32768), -32768) --> sext X - return CastInst::Create(Instruction::SExt, X, LHS->getType()); - } - - if (RHSC->isNegative() && - match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) && - RHSC->sge(-C->sext(RHSC->getBitWidth()))) { - // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val)) - Constant *NewC = - ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth())); - return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType()); - } - } + if (Instruction *X = foldAddWithConstant(I, *Builder)) + return X; - // FIXME: Use the match above instead of dyn_cast to allow these transforms - // for splat vectors. + // FIXME: This should be moved into the above helper function to allow these + // transforms for splat vectors. if (ConstantInt *CI = dyn_cast(RHS)) { // zext(bool) + C -> bool ? C + 1 : C if (ZExtInst *ZI = dyn_cast(LHS)) @@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { Constant *CI = ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType()); if (ConstantExpr::getZExt(CI, I.getType()) == RHSC && - computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) == - OverflowResult::NeverOverflows) { + willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) { // Insert the new, smaller add. Value *NewAdd = Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv"); @@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (LHSConv->getOperand(0)->getType() == RHSConv->getOperand(0)->getType() && (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && - computeOverflowForUnsignedAdd(LHSConv->getOperand(0), - RHSConv->getOperand(0), - &I) == OverflowResult::NeverOverflows) { + willNotOverflowUnsignedAdd(LHSConv->getOperand(0), + RHSConv->getOperand(0), I)) { // Insert the new integer add. 
Value *NewAdd = Builder->CreateNUWAdd( LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv"); @@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } // TODO(jingyue): Consider WillNotOverflowSignedAdd and - // WillNotOverflowUnsignedAdd to reduce the number of invocations of + // willNotOverflowUnsignedAdd to reduce the number of invocations of // computeKnownBits. if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) { Changed = true; I.setHasNoSignedWrap(true); } - if (!I.hasNoUnsignedWrap() && - computeOverflowForUnsignedAdd(LHS, RHS, &I) == - OverflowResult::NeverOverflows) { + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) { Changed = true; I.setHasNoUnsignedWrap(true); } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b114801cc1c0..82dc88f1b3ad 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -23,21 +23,6 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -static inline Value *dyn_castNotVal(Value *V) { - // If this is not(not(x)) don't return that this is a not: we want the two - // not's to be folded first. - if (BinaryOperator::isNot(V)) { - Value *Operand = BinaryOperator::getNotArgument(V); - if (!IsFreeToInvert(Operand, Operand->hasOneUse())) - return Operand; - } - - // Constants can be considered to be not'ed values... - if (ConstantInt *C = dyn_cast(V)) - return ConstantInt::get(C->getType(), ~C->getValue()); - return nullptr; -} - /// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into /// a four bit mask. static unsigned getFCmpCode(FCmpInst::Predicate CC) { @@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, } // This simplification is only valid if the upper range is not negative. - bool IsNegative, IsNotNegative; - ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1); - if (!IsNotNegative) + KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1); + if (!Known.isNonNegative()) return nullptr; if (Inverted) @@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { /// (~A & ~B) == (~(A | B)) /// (~A | ~B) == (~(A & B)) static Instruction *matchDeMorgansLaws(BinaryOperator &I, - InstCombiner::BuilderTy *Builder) { + InstCombiner::BuilderTy &Builder) { auto Opcode = I.getOpcode(); assert((Opcode == Instruction::And || Opcode == Instruction::Or) && "Trying to match De Morgan's Laws with something other than and/or"); + // Flip the logic operation. - if (Opcode == Instruction::And) - Opcode = Instruction::Or; - else - Opcode = Instruction::And; + Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And; - Value *Op0 = I.getOperand(0); - Value *Op1 = I.getOperand(1); - // TODO: Use pattern matchers instead of dyn_cast. 
- if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, - I.getName() + ".demorgan"); - return BinaryOperator::CreateNot(LogicOp); - } + Value *A, *B; + if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && + match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && + !IsFreeToInvert(A, A->hasOneUse()) && + !IsFreeToInvert(B, B->hasOneUse())) { + Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); + return BinaryOperator::CreateNot(AndOr); + } return nullptr; } @@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) return FoldedLogic; - if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder)) return DeMorgan; { @@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Value *V = SimplifyBSwap(I)) return replaceInstUsesWith(I, V); - if (ConstantInt *RHS = dyn_cast(Op1)) { - ConstantInt *C1 = nullptr; Value *X = nullptr; - // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) - if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && - Op0->hasOneUse()) { - Value *Or = Builder->CreateOr(X, RHS); - Or->takeName(Op0); - return BinaryOperator::CreateXor(Or, - Builder->getInt(C1->getValue() & ~RHS->getValue())); - } - } - if (isa(Op1)) if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) return FoldedLogic; @@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); - if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder)) return DeMorgan; // Canonicalize xor to the RHS. @@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } // Is this a 'not' (~) fed by a binary operator? 
- BinaryOperator *NotOp; - if (match(&I, m_Not(m_BinOp(NotOp)))) { - if (NotOp->getOpcode() == Instruction::And || - NotOp->getOpcode() == Instruction::Or) { + BinaryOperator *NotVal; + if (match(&I, m_Not(m_BinOp(NotVal)))) { + if (NotVal->getOpcode() == Instruction::And || + NotVal->getOpcode() == Instruction::Or) { // Apply DeMorgan's Law when inverts are free: // ~(X & Y) --> (~X | ~Y) // ~(X | Y) --> (~X & ~Y) - if (IsFreeToInvert(NotOp->getOperand(0), - NotOp->getOperand(0)->hasOneUse()) && - IsFreeToInvert(NotOp->getOperand(1), - NotOp->getOperand(1)->hasOneUse())) { - Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs"); - Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs"); - if (NotOp->getOpcode() == Instruction::And) + if (IsFreeToInvert(NotVal->getOperand(0), + NotVal->getOperand(0)->hasOneUse()) && + IsFreeToInvert(NotVal->getOperand(1), + NotVal->getOperand(1)->hasOneUse())) { + Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs"); + Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs"); + if (NotVal->getOpcode() == Instruction::And) return BinaryOperator::CreateOr(NotX, NotY); return BinaryOperator::CreateAnd(NotX, NotY); } - } else if (NotOp->getOpcode() == Instruction::AShr) { - // ~(~X >>s Y) --> (X >>s Y) - if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0))) - return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1)); + } + + // ~(~X >>s Y) --> (X >>s Y) + if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y)))) + return BinaryOperator::CreateAShr(X, Y); + + // If we are inverting a right-shifted constant, we may be able to eliminate + // the 'not' by inverting the constant and using the opposite shift type. + // Canonicalization rules ensure that only a negative constant uses 'ashr', + // but we must check that in case that transform has not fired yet. + const APInt *C; + if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) { + // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits) + Constant *NotC = ConstantInt::get(I.getType(), ~(*C)); + return BinaryOperator::CreateLShr(NotC, Y); + } + + if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) { + // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits) + Constant *NotC = ConstantInt::get(I.getType(), ~(*C)); + return BinaryOperator::CreateAShr(NotC, Y); } } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6989d67f0060..face7abcc95f 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { // Create a mask for bits above (ctlz) or below (cttz) the first known one. bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; - unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros() - : Known.One.countLeadingZeros(); - unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes() - : Known.Zero.countLeadingOnes(); + unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros() + : Known.countMaxLeadingZeros(); + unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros() + : Known.countMinLeadingZeros(); // If all bits above (ctlz) or below (cttz) the first known one are known // zero, this value is constant. 
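The cttz/ctlz hunk above rephrases the old countTrailingOnes/countLeadingOnes arithmetic in terms of the new KnownBits helpers; when the minimum and maximum possible zero counts agree, the intrinsic folds to a constant. A condensed sketch of that check (assuming the KnownBits API as used in the hunk; the helper name is mine):

    #include "llvm/Support/KnownBits.h"

    // Sketch: cttz(X) is a known constant when the count of bits known zero
    // below the first one matches the count up to the first known one.
    bool cttzIsConstant(const llvm::KnownBits &Known, unsigned &Result) {
      unsigned DefiniteZeros = Known.countMinTrailingZeros();
      unsigned PossibleZeros = Known.countMaxTrailingZeros();
      if (DefiniteZeros != PossibleZeros)
        return false;
      Result = DefiniteZeros; // e.g. known low bits ...?100 give exactly 2
      return true;
    }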
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 312d9baae43a..001a4bcf16f3 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } + // FIXME: Maybe combine the next two transforms to handle the no cast case + // more efficiently. Support vector types. Cleanup code by using m_OneUse. + // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. Value *A = nullptr; ConstantInt *Cst = nullptr; if (Src->hasOneUse() && @@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // the sign bit of the original value; performing ashr instead of lshr // generates bits of the same value as the sign bit. if (Src->hasOneUse() && - match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && - cast(Src)->getOperand(0)->hasOneUse()) { + match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) { + Value *SExt = cast(Src)->getOperand(0); + const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits(); const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + unsigned ShiftAmt = Cst->getZExtValue(); // This optimization can be only performed when zero bits generated by // the original lshr aren't pulled into the value after truncation, so we - // can only shift by values smaller than the size of destination type (in - // bits). - if (Cst->getValue().ult(ASize)) { - Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); + // can only shift by values no larger than the number of extension bits. + // FIXME: Instead of bailing when the shift is too large, use and to clear + // the extra bits. + if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) { + // If shifting by the size of the original value in bits or more, it is + // being filled with the sign bit, so shift by ASize-1 to avoid ub. + Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1)); Shift->takeName(Src); return CastInst::CreateIntegerCast(Shift, CI.getType(), true); } @@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // If we know that the value being extended is positive, we can use a zext // instead. - bool KnownZero, KnownOne; - ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI); - if (KnownZero) { + KnownBits Known = computeKnownBits(Src, 0, &CI); + if (Known.isNonNegative()) { Value *ZExt = Builder->CreateZExt(Src, DestTy); return replaceInstUsesWith(CI, ZExt); } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 34ce235b3fe2..60ed4057cedd 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) { } /// Try to fold icmp (binop), X or icmp X, (binop). +/// TODO: A large part of this logic is duplicated in InstSimplify's +/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code +/// duplication. 
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { if (!BO0 && !BO1) return nullptr; - CmpInst::Predicate Pred = I.getPredicate(); + const CmpInst::Predicate Pred = I.getPredicate(); bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; if (BO0 && isa(BO0)) NoOp0WrapProblem = @@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { case Instruction::Sub: case Instruction::Xor: if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b - return new ICmpInst(I.getPredicate(), BO0->getOperand(0), - BO1->getOperand(0)); + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b if (ConstantInt *CI = dyn_cast(BO0->getOperand(1))) { if (CI->getValue().isSignMask()) { - ICmpInst::Predicate Pred = + ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); - return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); } if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { - ICmpInst::Predicate Pred = + ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); - Pred = I.getSwappedPredicate(Pred); - return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + NewPred = I.getSwappedPredicate(NewPred); + return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); } } break; @@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { AP.getBitWidth() - AP.countTrailingZeros())); Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask); Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask); - return new ICmpInst(I.getPredicate(), And1, And2); + return new ICmpInst(Pred, And1, And2); } } break; + case Instruction::UDiv: case Instruction::LShr: - if (I.isSigned()) + if (I.isSigned() || !BO0->isExact() || !BO1->isExact()) break; - LLVM_FALLTHROUGH; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + case Instruction::SDiv: + if (!I.isEquality() || !BO0->isExact() || !BO1->isExact()) + break; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + case Instruction::AShr: if (!BO0->isExact() || !BO1->isExact()) break; - return new ICmpInst(I.getPredicate(), BO0->getOperand(0), - BO1->getOperand(0)); + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + case Instruction::Shl: { bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap(); bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap(); @@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { break; if (!NSW && I.isSigned()) break; - return new ICmpInst(I.getPredicate(), BO0->getOperand(0), - BO1->getOperand(0)); + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); } } } @@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { auto BitwiseAnd = m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); - if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { + if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) { auto *Zero = Constant::getNullValue(BO0->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); } diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 3be6419a129a..1424f61fe701 100644 --- 
a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -30,6 +30,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" #include "llvm/Transforms/Utils/Local.h" @@ -388,10 +389,21 @@ private: bool DoTransform = true); Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); - bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI); + bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) { + return computeOverflowForSignedAdd(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + }; + bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) { + return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + }; bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI); bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI); bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI); + bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) { + return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + }; Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef Mask); @@ -492,7 +504,11 @@ public: void computeKnownBits(Value *V, KnownBits &Known, unsigned Depth, Instruction *CxtI) const { - return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); + llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); + } + KnownBits computeKnownBits(Value *V, unsigned Depth, + Instruction *CxtI) const { + return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); } bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0, @@ -503,11 +519,6 @@ public: Instruction *CxtI = nullptr) const { return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT); } - void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, - unsigned Depth = 0, Instruction *CxtI = nullptr) const { - return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI, - &DT); - } OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS, const Instruction *CxtI) { return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT); @@ -516,6 +527,11 @@ public: const Instruction *CxtI) { return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); } + OverflowResult computeOverflowForSignedAdd(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } /// Maximum size of array considered when transforming. uint64_t MaxArraySizeForCombine; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 675553017838..a4d84ae81aa0 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -885,10 +885,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, // first non-zero index. 
auto IsAllNonNegative = [&]() { for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) { - bool KnownNonNegative, KnownNegative; - IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative, - KnownNegative, 0, MemI); - if (KnownNonNegative) + KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI); + if (Known.isNonNegative()) continue; return false; } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index face9d9237ae..2a35259f2103 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS, // product is exactly the minimum negative number. // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000 // For simplicity we just check if at least one side is not negative. - bool LHSNonNegative, LHSNegative; - bool RHSNonNegative, RHSNegative; - ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI); - ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI); - if (LHSNonNegative || RHSNonNegative) + KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI); + if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) return true; } return false; @@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { Constant *CI = ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType()); if (ConstantExpr::getZExt(CI, I.getType()) == Op1C && - computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) == - OverflowResult::NeverOverflows) { + willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) { // Insert the new, smaller mul. Value *NewMul = Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv"); @@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Op0Conv->getOperand(0)->getType() == Op1Conv->getOperand(0)->getType() && (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) && - computeOverflowForUnsignedMul(Op0Conv->getOperand(0), - Op1Conv->getOperand(0), - &I) == OverflowResult::NeverOverflows) { + willNotOverflowUnsignedMul(Op0Conv->getOperand(0), + Op1Conv->getOperand(0), I)) { // Insert the new integer mul. 
Value *NewMul = Builder->CreateNUWMul( Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv"); @@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { I.setHasNoSignedWrap(true); } - if (!I.hasNoUnsignedWrap() && - computeOverflowForUnsignedMul(Op0, Op1, &I) == - OverflowResult::NeverOverflows) { + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) { Changed = true; I.setHasNoUnsignedWrap(true); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 05b01774cd5e..4028a92771a4 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1)) return I; - unsigned Leaders = Known2.Zero.countLeadingOnes(); + unsigned Leaders = Known2.countMinLeadingZeros(); Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; break; } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 1792cb585f87..65b1148cb03b 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Canonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; - if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), - TrueDest, FalseDest)) && - BI.getCondition()->hasOneUse()) + if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))), + TrueDest, FalseDest))) { + // TODO: Why are we only transforming these 3 predicates? if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE || FPred == FCmpInst::FCMP_OGE) { FCmpInst *Cond = cast(BI.getCondition()); @@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { Worklist.Add(Cond); return &BI; } + } // Canonicalize icmp_ne -> icmp_eq ICmpInst::Predicate IPred; - if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)), - TrueDest, FalseDest)) && - BI.getCondition()->hasOneUse()) + if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))), + TrueDest, FalseDest))) { if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE || IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE || IPred == ICmpInst::ICMP_SGE) { @@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { Worklist.Add(Cond); return &BI; } + } return nullptr; } @@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { unsigned BitWidth = cast(Cond->getType())->getBitWidth(); KnownBits Known(BitWidth); computeKnownBits(Cond, Known, 0, &SI); - unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes(); - unsigned LeadingKnownOnes = Known.One.countLeadingOnes(); + unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); + unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); // Compute the number of leading bits we can ignore. // TODO: A better way to determine this would use ComputeNumSignBits(). @@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, // Lower dbg.declare intrinsics otherwise their value may be clobbered // by instcombiner. - bool DbgDeclaresChanged = LowerDbgDeclare(F); + bool MadeIRChange = LowerDbgDeclare(F); // Iterate while there is work to do. 
int Iteration = 0; @@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); - bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist); + MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines, AA, AC, TLI, DT, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; - Changed |= IC.run(); - if (!Changed) + if (!IC.run()) break; } - return DbgDeclaresChanged || Iteration > 1; + return MadeIRChange || Iteration > 1; } PreservedAnalyses InstCombinePass::run(Function &F, diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b034ccc46933..7eea44d6aca0 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -613,7 +613,15 @@ public: bool UseGlobalsGC = true) : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), Recover(Recover || ClRecover), - UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {} + UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + // Not a typo: ClWithComdat is almost completely pointless without + // ClUseGlobalsGC (because then it only works on modules without + // globals, which are rare); it is a prerequisite for ClUseGlobalsGC; + // and both suffer from gold PR19002 for which UseGlobalsGC constructor + // argument is designed as workaround. Therefore, disable both + // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to + // do globals-gc. + UseCtorComdat(UseGlobalsGC && ClWithComdat) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid StringRef getPassName() const override { return "AddressSanitizerModule"; } @@ -656,6 +664,7 @@ private: bool CompileKernel; bool Recover; bool UseGlobalsGC; + bool UseCtorComdat; Type *IntptrTy; LLVMContext *C; Triple TargetTriple; @@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer, : GlobalVariable::PrivateLinkage; GlobalVariable *Metadata = new GlobalVariable( M, Initializer->getType(), false, Linkage, Initializer, - Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName)); + Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName)); Metadata->setSection(getGlobalMetadataSection()); return Metadata; } @@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO( // On recent Mach-O platforms, use a structure which binds the liveness of // the global variable to the metadata struct. Keep the list of "Liveness" GV // created to be added to llvm.compiler.used - StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr); + StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy); SmallVector LivenessGlobals(ExtendedGlobals.size()); for (size_t i = 0; i < ExtendedGlobals.size(); i++) { @@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO( // On recent Mach-O platforms, we emit the global metadata in a way that // allows the linker to properly strip dead globals. 
- auto LivenessBinder = ConstantStruct::get( - LivenessTy, Initializer->getAggregateElement(0u), - ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr); + auto LivenessBinder = + ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u), + ConstantExpr::getPointerCast(Metadata, IntptrTy)); GlobalVariable *Liveness = new GlobalVariable( M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder, Twine("__asan_binder_") + G->getName()); @@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool // We initialize an array of such structures and pass it to a run-time call. StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, - IntptrTy, IntptrTy, IntptrTy, nullptr); + IntptrTy, IntptrTy, IntptrTy); SmallVector NewGlobals(n); SmallVector Initializers(n); @@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); - StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr); - Constant *NewInitializer = - ConstantStruct::get(NewTy, G->getInitializer(), - Constant::getNullValue(RightRedZoneTy), nullptr); + StructType *NewTy = StructType::get(Ty, RightRedZoneTy); + Constant *NewInitializer = ConstantStruct::get( + NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy)); // Create a new global variable with enough space for a redzone. GlobalValue::LinkageTypes Linkage = G->getLinkage(); @@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool ConstantExpr::getPointerCast(Name, IntptrTy), ConstantExpr::getPointerCast(ModuleName, IntptrTy), ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, - ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr); + ConstantExpr::getPointerCast(ODRIndicator, IntptrTy)); if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; @@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { // Put the constructor and destructor in comdat if both // (1) global instrumentation is not TU-specific // (2) target is ELF. 
- if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { + if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName)); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority, AsanCtorFunction); diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 8786781933ea..e2e3cbdbc295 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { ArgTypes.push_back(ShadowPtrTy); Type *RetType = T->getReturnType(); if (!RetType->isVoidTy()) - RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr); + RetType = StructType::get(RetType, ShadowTy); return FunctionType::get(RetType, ArgTypes, T->isVarArg()); } @@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) { GetArgTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ArgTLSTy), - (Type *)nullptr))); + FunctionType::get(PointerType::getUnqual(ArgTLSTy), false))); } if (GetRetvalTLSPtr) { RetvalTLS = nullptr; GetRetvalTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ShadowTy), - (Type *)nullptr))); + FunctionType::get(PointerType::getUnqual(ShadowTy), false))); } ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp index 7dea1dee756a..e89384c559fe 100644 --- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp +++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp @@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( // u64 *ArrayCounter; // }; auto *StructInfoTy = - StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, - Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr); + StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, + Int8PtrPtrTy, Int64PtrTy, Int64PtrTy); auto *StructInfoPtrTy = StructInfoTy->getPointerTo(); // This structure should be kept consistent with the CacheFragInfo struct // in the runtime library. @@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( // u32 NumStructs; // StructInfo *Structs; // }; - auto *CacheFragInfoTy = - StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr); + auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy); std::vector Vec = M.getIdentifiedStructTypes(); unsigned NumStructs = 0; @@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0); ArrayCounterIdx[1] = ConstantInt::get(Int32Ty, getArrayCounterIdx(StructTy)); - Initializers.push_back( - ConstantStruct::get( - StructInfoTy, - ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), - ConstantInt::get(Int32Ty, - DL.getStructLayout(StructTy)->getSizeInBytes()), - ConstantInt::get(Int32Ty, StructTy->getNumElements()), - Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) : - ConstantExpr::getPointerCast(Offset, Int32PtrTy), - Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) : - ConstantExpr::getPointerCast(Size, Int32PtrTy), - TypeName == nullptr ? 
ConstantPointerNull::get(Int8PtrPtrTy) : - ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), - ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, - FieldCounterIdx), - ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, - ArrayCounterIdx), - nullptr)); + Initializers.push_back(ConstantStruct::get( + StructInfoTy, + ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), + ConstantInt::get(Int32Ty, + DL.getStructLayout(StructTy)->getSizeInBytes()), + ConstantInt::get(Int32Ty, StructTy->getNumElements()), + Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) + : ConstantExpr::getPointerCast(Offset, Int32PtrTy), + Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) + : ConstantExpr::getPointerCast(Size, Int32PtrTy), + TypeName == nullptr + ? ConstantPointerNull::get(Int8PtrPtrTy) + : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), + ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, + FieldCounterIdx), + ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, + ArrayCounterIdx))); } // Structs. Constant *StructInfo; @@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( auto *CacheFragInfoGV = new GlobalVariable( M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage, - ConstantStruct::get(CacheFragInfoTy, - UnitName, - ConstantInt::get(Int32Ty, NumStructs), - StructInfo, - nullptr)); + ConstantStruct::get(CacheFragInfoTy, UnitName, + ConstantInt::get(Int32Ty, NumStructs), StructInfo)); return CacheFragInfoGV; } diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 15333a5317dd..ff753c20a94a 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy, bool Signed = false) { Type *srcTy = V->getType(); + size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); + size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); + if (srcSizeInBits > 1 && dstSizeInBits == 1) + return IRB.CreateICmpNE(V, getCleanShadow(V)); + if (dstTy->isIntegerTy() && srcTy->isIntegerTy()) return IRB.CreateIntCast(V, dstTy, Signed); if (dstTy->isVectorTy() && srcTy->isVectorTy() && dstTy->getVectorNumElements() == srcTy->getVectorNumElements()) return IRB.CreateIntCast(V, dstTy, Signed); - size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); - size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits)); Value *V2 = IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 3f1a77b49a44..ee493a8ec7e1 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { bool Changed = false; if (!NUW) { - ConstantRange NUWRange = - LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, - OBO::NoUnsignedWrap); + ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( + BinaryOperator::Add, LRange, OBO::NoUnsignedWrap); if (!NUWRange.isEmptySet()) { bool NewNUW = NUWRange.contains(LazyRRange()); AddOp->setHasNoUnsignedWrap(NewNUW); @@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, 
                          LazyValueInfo *LVI) {
     }
   }
 
   if (!NSW) {
-    ConstantRange NSWRange =
-        LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
-                                          OBO::NoSignedWrap);
+    ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+        BinaryOperator::Add, LRange, OBO::NoSignedWrap);
     if (!NSWRange.isEmptySet()) {
       bool NewNSW = NSWRange.contains(LazyRRange());
       AddOp->setHasNoSignedWrap(NewNSW);
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 48d5ae88cda9..6693a26e8890 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -144,6 +144,10 @@ private:
   bool recognizePopcount();
   void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                                PHINode *CntPhi, Value *Var);
+  bool recognizeAndInsertCTLZ();
+  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+                                PHINode *CntPhi, Value *Var, const DebugLoc DL,
+                                bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
 
   /// @}
 };
@@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
 }
 
 bool LoopIdiomRecognize::runOnNoncountableLoop() {
-  return recognizePopcount();
+  return recognizePopcount() || recognizeAndInsertCTLZ();
 }
 
 /// Check if the given conditional branch is based on the comparison between
@@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
   return true;
 }
 
+/// Return true if the CTLZ idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the leading zeros (CTLZ),
+///    or nullptr if there is no such instruction.
+/// 2) \p CntPhi is set to the corresponding phi node,
+///    or nullptr if there is no such node.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 == 0)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val;
+///    do {
+///       x = phi(x0, x.next);       // PhiX
+///       cnt = phi(cnt0, cnt.next);
+///
+///       cnt.next = cnt + 1;
+///        ...
+///       x.next = x >> 1;           // DefX
+///        ...
+///    } while (x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+                            Instruction *&CntInst, PHINode *&CntPhi,
+                            Instruction *&DefX) {
+  BasicBlock *LoopEntry;
+  Value *VarX = nullptr;
+
+  DefX = nullptr;
+  PhiX = nullptr;
+  CntInst = nullptr;
+  CntPhi = nullptr;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in the desirable form.
+  if (Value *T = matchCondition(
+          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+    DefX = dyn_cast<Instruction>(T);
+  else
+    return false;
+
+  // step 2: Detect the instruction corresponding to "x.next = x >> 1".
+  if (!DefX || DefX->getOpcode() != Instruction::AShr)
+    return false;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+  VarX = DefX->getOperand(0);
+
+  // step 3: Check the recurrence of variable X.
+  PhiX = dyn_cast<PHINode>(VarX);
+  if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+    return false;
+
+  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1.
+  //         TODO: We can skip this step. If the loop trip count is known
+  //         (CTLZ), then all uses of "cnt.next" could be optimized to the
+  //         trip count plus "cnt0". Currently it is not optimized.
+  //         This step could also be used to detect the POPCNT instruction:
+  //         cnt.next = cnt + (x.next & 1)
+  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                            IterE = LoopEntry->end();
+       Iter != IterE; Iter++) {
+    Instruction *Inst = &*Iter;
+    if (Inst->getOpcode() != Instruction::Add)
+      continue;
+
+    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+    if (!Inc || !Inc->isOne())
+      continue;
+
+    PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+    if (!Phi || Phi->getParent() != LoopEntry)
+      continue;
+
+    CntInst = Inst;
+    CntPhi = Phi;
+    break;
+  }
+  if (!CntInst)
+    return false;
+
+  return true;
+}
+
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ trip count).
+/// Returns true if the CTLZ trip count was inserted; otherwise returns false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  Instruction *CntInst, *DefX;
+  PHINode *CntPhi, *PhiX;
+  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+    return false;
+
+  bool IsCntPhiUsedOutsideLoop = false;
+  for (User *U : CntPhi->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntPhiUsedOutsideLoop = true;
+      break;
+    }
+  bool IsCntInstUsedOutsideLoop = false;
+  for (User *U : CntInst->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntInstUsedOutsideLoop = true;
+      break;
+    }
+  // If both CntInst and CntPhi are used outside the loop the profitability
+  // is questionable.
+  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+    return false;
+
+  // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X is
+  // 0. If we cannot guarantee X != 0, we need to check for this when we
+  // expand the intrinsic.
+  bool ZeroCheck = false;
+  // It is safe to assume the Preheader exists, as it was checked in the
+  // parent function runOnLoop.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  Value *InitX = PhiX->getIncomingValueForBlock(PH);
+  // If we check X != 0 before entering the loop we don't need a zero
+  // check in the CTLZ intrinsic.
+  if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+    if (BranchInst *PreCondBr =
+            dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+      if (matchCondition(PreCondBr, PH) == InitX)
+        ZeroCheck = true;
+    }
+
+  // Check if the CTLZ intrinsic is profitable. Assume it is always profitable
+  // if we delete the loop (the remaining loop consists of only these 6
+  // instructions):
+  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+  //  %shr = ashr %n.addr.0, 1
+  //  %tobool = icmp eq %shr, 0
+  //  %inc = add nsw %i.0, 1
+  //  br i1 %tobool
+
+  IRBuilder<> Builder(PH->getTerminator());
+  SmallVector<Value *, 2> Ops =
+      {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+  ArrayRef<Value *> Args(Ops);
+  if (CurLoop->getHeader()->size() != 6 &&
+      TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+          TargetTransformInfo::TCC_Basic)
+    return false;
+
+  const DebugLoc DL = DefX->getDebugLoc();
+  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+                           IsCntPhiUsedOutsideLoop);
+  return true;
+}
+
 /// Recognizes a population count idiom in a non-countable loop.
/// /// If detected, transforms the relevant code to issue the popcount intrinsic @@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, return CI; } +static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + const DebugLoc &DL, bool ZeroCheck) { + Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; + Type *Tys[] = {Val->getType()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// Transform the following loop: +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +// LOOP_BODY +/// Br: loop if (DefX != 0) +/// Use(CntPhi) or Use(CntInst) +/// +/// Into: +/// If CntPhi used outside the loop: +/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) +/// Count = CountPrev + 1 +/// else +/// Count = BitWidth(InitX) - CTLZ(InitX) +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// PhiCount = PHI [Count, Dec] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +/// Dec = PhiCount - 1 +/// LOOP_BODY +/// Br: loop if (Dec != 0) +/// Use(CountPrev + Cnt0) // Use(CntPhi) +/// or +/// Use(Count + Cnt0) // Use(CntInst) +/// +/// If LOOP_BODY is empty the loop will be deleted. +/// If CntInst and DefX are not used in LOOP_BODY they will be removed. +void LoopIdiomRecognize::transformLoopToCountable( + BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, + const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + BranchInst *PreheaderBr = dyn_cast(Preheader->getTerminator()); + + // Step 1: Insert the CTLZ instruction at the end of the preheader block + // Count = BitWidth - CTLZ(InitX); + // If there are uses of CntPhi create: + // CountPrev = BitWidth - CTLZ(InitX >> 1); + IRBuilder<> Builder(PreheaderBr); + Builder.SetCurrentDebugLocation(DL); + Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; + + if (IsCntPhiUsedOutsideLoop) + InitXNext = Builder.CreateAShr(InitX, + ConstantInt::get(InitX->getType(), 1)); + else + InitXNext = InitX; + CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); + Count = Builder.CreateSub( + ConstantInt::get(CTLZ->getType(), + CTLZ->getType()->getIntegerBitWidth()), + CTLZ); + if (IsCntPhiUsedOutsideLoop) { + CountPrev = Count; + Count = Builder.CreateAdd( + CountPrev, + ConstantInt::get(CountPrev->getType(), 1)); + } + if (IsCntPhiUsedOutsideLoop) + NewCount = Builder.CreateZExtOrTrunc(CountPrev, + cast(CntInst->getType())); + else + NewCount = Builder.CreateZExtOrTrunc(Count, + cast(CntInst->getType())); + + // If the CTLZ counter's initial value is not zero, insert Add Inst. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); + ConstantInt *InitConst = dyn_cast(CntInitVal); + if (!InitConst || !InitConst->isZero()) + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + + // Step 2: Insert new IV and loop condition: + // loop: + // ... + // PhiCount = PHI [Count, Dec] + // ... + // Dec = PhiCount - 1 + // ... 
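
// For intuition about the rewrite documented above: the matched loop counts
// how many arithmetic right shifts reach zero, which for a positive initial
// value is BitWidth - CTLZ(InitX). A hand-written, source-level sketch of
// the before/after shapes under the pass's assumptions (single-block loop,
// x > 0); the __builtin_clzll call stands in for the ctlz intrinsic:
#include <cassert>
#include <cstdint>

static unsigned originalLoop(uint64_t X, uint64_t &Work) {
  unsigned Cnt = 0;
  do {
    Cnt += 1;         // cnt.next = cnt + 1 (CntInst)
    Work += X;        // stand-in for LOOP_BODY
    X >>= 1;          // x.next = x >> 1 (DefX)
  } while (X != 0);   // non-countable: the exit depends on the shifted value
  return Cnt;
}

static unsigned transformedLoop(uint64_t InitX, uint64_t &Work) {
  // Count = BitWidth(InitX) - CTLZ(InitX), computed once in the preheader.
  unsigned Count = 64 - static_cast<unsigned>(__builtin_clzll(InitX));
  unsigned Cnt = 0;
  uint64_t X = InitX;
  for (unsigned Dec = Count; Dec != 0; --Dec) { // countable: br if (Dec != 0)
    Cnt += 1;
    Work += X;
    X >>= 1;
  }
  return Cnt; // uses outside the loop could take Count + cnt0 directly
}

int main() {
  uint64_t W1 = 0, W2 = 0;
  for (uint64_t X : {1ULL, 7ULL, 1024ULL, 123456789ULL}) {
    assert(originalLoop(X, W1) == transformedLoop(X, W2));
    assert(W1 == W2);
  }
  return 0;
}
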
+ // Br: loop if (Dec != 0) + BasicBlock *Body = *(CurLoop->block_begin()); + auto *LbBr = dyn_cast(Body->getTerminator()); + ICmpInst *LbCond = cast(LbBr->getCondition()); + Type *Ty = Count->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + + Builder.SetInsertPoint(LbCond); + Instruction *TcDec = cast( + Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), + "tcdec", false, true)); + + TcPhi->addIncoming(Count, Preheader); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = + (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + + // Step 3: All the references to the original counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctlz(x). + if (IsCntPhiUsedOutsideLoop) + CntPhi->replaceUsesOutsideBlock(NewCount, Body); + else + CntInst->replaceUsesOutsideBlock(NewCount, Body); + + // step 4: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} + void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var) { diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 3c9850b156ac..5e0a705782ea 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -283,7 +283,6 @@ public: // Forward propagation info const Expression *getDefiningExpr() const { return DefiningExpr; } - void setDefiningExpr(const Expression *E) { DefiningExpr = E; } // Value member set bool empty() const { return Members.empty(); } @@ -317,6 +316,9 @@ public: --StoreCount; } + // True if this class has no memory members. + bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); } + // Return true if two congruence classes are equivalent to each other. This // means // that every field but the ID number and the dead field are equivalent. @@ -401,9 +403,12 @@ class NewGVN { MemorySSAWalker *MSSAWalker; const DataLayout &DL; std::unique_ptr PredInfo; - BumpPtrAllocator ExpressionAllocator; - ArrayRecycler ArgRecycler; - TarjanSCC SCCFinder; + + // These are the only two things the create* functions should have + // side-effects on due to allocating memory. + mutable BumpPtrAllocator ExpressionAllocator; + mutable ArrayRecycler ArgRecycler; + mutable TarjanSCC SCCFinder; const SimplifyQuery SQ; // Number of function arguments, used by ranking @@ -430,11 +435,12 @@ class NewGVN { // In order to correctly ensure propagation, we must keep track of what // comparisons we used, so that when the values of the comparisons change, we // propagate the information to the places we used the comparison. - DenseMap> PredicateToUsers; - // Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has + mutable DenseMap> + PredicateToUsers; // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for // stores, we no longer can rely solely on the def-use chains of MemorySSA. - DenseMap> MemoryToUsers; + mutable DenseMap> + MemoryToUsers; // A table storing which memorydefs/phis represent a memory state provably // equivalent to another memory state. 
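
// On the `mutable` members introduced above: NewGVN's create*/performSymbolic*
// helpers are logically const (value numbering queries), yet they must
// allocate expressions and record cache entries as side effects. A small,
// self-contained sketch of the same pattern (hypothetical class, not the
// NewGVN interface):
#include <cassert>
#include <cstdint>
#include <map>

class Analyzer {
  // The cache is an implementation detail, so it is mutable and may be
  // updated from const query functions.
  mutable std::map<uint64_t, uint64_t> ResultCache;

public:
  uint64_t analyze(uint64_t N) const {
    auto It = ResultCache.find(N);
    if (It != ResultCache.end())
      return It->second;       // memoized: no recomputation
    uint64_t R = N * N + 1;    // stand-in for an expensive computation
    ResultCache.emplace(N, R); // legal in a const member: the field is mutable
    return R;
  }
};

int main() {
  const Analyzer A; // even a const object can fill its internal caches
  assert(A.analyze(7) == 50);
  assert(A.analyze(7) == 50); // second call is a cache hit
  return 0;
}
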
@@ -457,7 +463,7 @@ class NewGVN { DenseMap MemoryPhiState; enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle }; - DenseMap PhiCycleState; + mutable DenseMap PhiCycleState; // Expression to class mapping. using ExpressionClassMap = DenseMap; ExpressionClassMap ExpressionToClass; @@ -511,21 +517,24 @@ public: private: // Expression handling. - const Expression *createExpression(Instruction *); - const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *); - PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge, - bool &AllConstant); - const VariableExpression *createVariableExpression(Value *); - const ConstantExpression *createConstantExpression(Constant *); - const Expression *createVariableOrConstant(Value *V); - const UnknownExpression *createUnknownExpression(Instruction *); + const Expression *createExpression(Instruction *) const; + const Expression *createBinaryExpression(unsigned, Type *, Value *, + Value *) const; + PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge, + bool &AllConstant) const; + const VariableExpression *createVariableExpression(Value *) const; + const ConstantExpression *createConstantExpression(Constant *) const; + const Expression *createVariableOrConstant(Value *V) const; + const UnknownExpression *createUnknownExpression(Instruction *) const; const StoreExpression *createStoreExpression(StoreInst *, - const MemoryAccess *); + const MemoryAccess *) const; LoadExpression *createLoadExpression(Type *, Value *, LoadInst *, - const MemoryAccess *); - const CallExpression *createCallExpression(CallInst *, const MemoryAccess *); - const AggregateValueExpression *createAggregateValueExpression(Instruction *); - bool setBasicExpressionInfo(Instruction *, BasicExpression *); + const MemoryAccess *) const; + const CallExpression *createCallExpression(CallInst *, + const MemoryAccess *) const; + const AggregateValueExpression * + createAggregateValueExpression(Instruction *) const; + bool setBasicExpressionInfo(Instruction *, BasicExpression *) const; // Congruence class handling. CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) { @@ -560,17 +569,18 @@ private: // Symbolic evaluation. 
const Expression *checkSimplificationResults(Expression *, Instruction *, - Value *); - const Expression *performSymbolicEvaluation(Value *); + Value *) const; + const Expression *performSymbolicEvaluation(Value *) const; const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *, - Instruction *, MemoryAccess *); - const Expression *performSymbolicLoadEvaluation(Instruction *); - const Expression *performSymbolicStoreEvaluation(Instruction *); - const Expression *performSymbolicCallEvaluation(Instruction *); - const Expression *performSymbolicPHIEvaluation(Instruction *); - const Expression *performSymbolicAggrValueEvaluation(Instruction *); - const Expression *performSymbolicCmpEvaluation(Instruction *); - const Expression *performSymbolicPredicateInfoEvaluation(Instruction *); + Instruction *, + MemoryAccess *) const; + const Expression *performSymbolicLoadEvaluation(Instruction *) const; + const Expression *performSymbolicStoreEvaluation(Instruction *) const; + const Expression *performSymbolicCallEvaluation(Instruction *) const; + const Expression *performSymbolicPHIEvaluation(Instruction *) const; + const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; + const Expression *performSymbolicCmpEvaluation(Instruction *) const; + const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -620,8 +630,8 @@ private: void markPredicateUsersTouched(Instruction *); void markValueLeaderChangeTouched(CongruenceClass *CC); void markMemoryLeaderChangeTouched(CongruenceClass *CC); - void addPredicateUsers(const PredicateBase *, Instruction *); - void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U); + void addPredicateUsers(const PredicateBase *, Instruction *) const; + void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const; // Main loop of value numbering void iterateTouchedInstructions(); @@ -634,7 +644,7 @@ private: void verifyIterationSettled(Function &F); bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const; BasicBlock *getBlockForValue(Value *V) const; - void deleteExpression(const Expression *E); + void deleteExpression(const Expression *E) const; unsigned InstrToDFSNum(const Value *V) const { assert(isa(V) && "This should not be used for MemoryAccesses"); return InstrDFS.lookup(V); @@ -654,7 +664,7 @@ private: ? InstrToDFSNum(cast(MA)->getMemoryInst()) : InstrDFS.lookup(MA); } - bool isCycleFree(const PHINode *PN); + bool isCycleFree(const PHINode *PN) const; template T *getMinDFSOfRange(const Range &) const; // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. @@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const { // Delete a definitely dead expression, so it can be reused by the expression // allocator. Some of these are not in creation functions, so we have to accept // const versions. 
-void NewGVN::deleteExpression(const Expression *E) { +void NewGVN::deleteExpression(const Expression *E) const { assert(isa(E)); auto *BE = cast(E); const_cast(BE)->deallocateOperands(ArgRecycler); @@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) { } PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, - bool &AllConstant) { + bool &AllConstant) const { BasicBlock *PHIBlock = I->getParent(); auto *PN = cast(I); auto *E = @@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock)); + // NewGVN assumes the operands of a PHI node are in a consistent order across + // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix + // this in LLVM at some point we don't want GVN to find wrong congruences. + // Therefore, here we sort uses in predecessor order. + // We're sorting the values by pointer. In theory this might be cause of + // non-determinism, but here we don't rely on the ordering for anything + // significant, e.g. we don't create new instructions based on it so we're + // fine. + SmallVector PHIOperands; + for (const Use &U : PN->operands()) + PHIOperands.push_back(&U); + std::sort(PHIOperands.begin(), PHIOperands.end(), + [&](const Use *U1, const Use *U2) { + return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2); + }); + // Filter out unreachable phi operands. - auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) { - return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock}); + auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { + return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}); }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), - [&](const Use &U) -> Value * { - auto *BB = PN->getIncomingBlock(U); + [&](const Use *U) -> Value * { + auto *BB = PN->getIncomingBlock(*U); auto *DTN = DT->getNode(BB); if (RPOOrdering.lookup(DTN) >= PHIRPO) HasBackedge = true; - AllConstant &= isa(U) || isa(U); + AllConstant &= isa(*U) || isa(*U); // Don't try to transform self-defined phis. - if (U == PN) + if (*U == PN) return PN; - return lookupOperandLeader(U); + return lookupOperandLeader(*U); }); return E; } // Set basic expression info (Arguments, type, opcode) for Expression // E from Instruction I in block B. -bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) { +bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const { bool AllConstant = true; if (auto *GEP = dyn_cast(I)) E->setType(GEP->getSourceElementType()); @@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) { } const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, - Value *Arg1, Value *Arg2) { + Value *Arg1, + Value *Arg2) const { auto *E = new (ExpressionAllocator) BasicExpression(2); E->setType(T); @@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, // TODO: Once finished, this should not take an Instruction, we only // use it for printing. 
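
// The createPHIExpression hunk above sorts a PHI's operands by incoming
// block before they are hashed, so two PHIs that differ only in operand
// order are still found congruent. A standalone sketch of that
// canonicalization (toy types; the real code sorts llvm::Use pointers by
// predecessor):
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

using PhiOperand = std::pair<int /*BlockId*/, int /*ValueId*/>;

static std::vector<PhiOperand> canonicalize(std::vector<PhiOperand> Ops) {
  // Sort by predecessor block, mirroring the std::sort on incoming blocks.
  // The sort key must not depend on operand position, or congruences are
  // missed.
  std::sort(Ops.begin(), Ops.end(),
            [](const PhiOperand &A, const PhiOperand &B) {
              return A.first < B.first;
            });
  return Ops;
}

int main() {
  // The same PHI written with its incoming edges in two different orders.
  std::vector<PhiOperand> P1 = {{1, 42}, {2, 7}};
  std::vector<PhiOperand> P2 = {{2, 7}, {1, 42}};
  assert(canonicalize(P1) == canonicalize(P2)); // now compare/hash equal
  return 0;
}
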
const Expression *NewGVN::checkSimplificationResults(Expression *E, - Instruction *I, Value *V) { + Instruction *I, + Value *V) const { if (!V) return nullptr; if (auto *C = dyn_cast(V)) { @@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, return nullptr; } -const Expression *NewGVN::createExpression(Instruction *I) { +const Expression *NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); bool AllConstant = setBasicExpressionInfo(I, E); @@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) { } const AggregateValueExpression * -NewGVN::createAggregateValueExpression(Instruction *I) { +NewGVN::createAggregateValueExpression(Instruction *I) const { if (auto *II = dyn_cast(I)) { auto *E = new (ExpressionAllocator) AggregateValueExpression(I->getNumOperands(), II->getNumIndices()); @@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) { llvm_unreachable("Unhandled type of aggregate value operation"); } -const VariableExpression *NewGVN::createVariableExpression(Value *V) { +const VariableExpression *NewGVN::createVariableExpression(Value *V) const { auto *E = new (ExpressionAllocator) VariableExpression(V); E->setOpcode(V->getValueID()); return E; } -const Expression *NewGVN::createVariableOrConstant(Value *V) { +const Expression *NewGVN::createVariableOrConstant(Value *V) const { if (auto *C = dyn_cast(V)) return createConstantExpression(C); return createVariableExpression(V); } -const ConstantExpression *NewGVN::createConstantExpression(Constant *C) { +const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const { auto *E = new (ExpressionAllocator) ConstantExpression(C); E->setOpcode(C->getValueID()); return E; } -const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) { +const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) UnknownExpression(I); E->setOpcode(I->getOpcode()); return E; } -const CallExpression *NewGVN::createCallExpression(CallInst *CI, - const MemoryAccess *MA) { +const CallExpression * +NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const { // FIXME: Add operand bundles for calls. 
auto *E = new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA); @@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const { const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const { auto *CC = getMemoryClass(MA); assert(CC->getMemoryLeader() && - "Every MemoryAccess should be mapped to a " - "congruence class with a represenative memory " - "access"); + "Every MemoryAccess should be mapped to a congruence class with a " + "representative memory access"); return CC->getMemoryLeader(); } @@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const { LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, LoadInst *LI, - const MemoryAccess *MA) { + const MemoryAccess *MA) const { auto *E = new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA)); E->allocateOperands(ArgRecycler, ExpressionAllocator); @@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, return E; } -const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, - const MemoryAccess *MA) { +const StoreExpression * +NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const { auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand()); auto *E = new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA); @@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, return E; } -const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { // Unlike loads, we never try to eliminate stores, so we do not check if they // are simple and avoid value numbering them. auto *SI = cast(I); @@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) { const Expression * NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, LoadInst *LI, Instruction *DepInst, - MemoryAccess *DefiningAccess) { + MemoryAccess *DefiningAccess) const { assert((!LI || LI->isSimple()) && "Not a simple load"); if (auto *DepSI = dyn_cast(DepInst)) { // Can't forward from non-atomic to atomic without violating memory model. @@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, return nullptr; } -const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { auto *LI = cast(I); // We can eliminate in favor of non-simple loads, but we won't be able to @@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) { } const Expression * -NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { +NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return nullptr; @@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { return nullptr; if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) { - DEBUG(dbgs() << "Copy is not of any condition operands!"); + DEBUG(dbgs() << "Copy is not of any condition operands!\n"); return nullptr; } Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); @@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { } // Evaluate read only and pure calls, and create an expression result. 
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { auto *CI = cast(I); if (auto *II = dyn_cast(I)) { // Instrinsics with the returned attribute are copies of arguments. @@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, DEBUG(dbgs() << "Setting " << *From); DEBUG(dbgs() << " equivalent to congruence class "); DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader "); - DEBUG(dbgs() << *NewClass->getMemoryLeader()); - DEBUG(dbgs() << "\n"); + DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n"); auto LookupResult = MemoryAccessToClass.find(From); bool Changed = false; @@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, NewClass->memory_insert(MP); // This may have killed the class if it had no non-memory members if (OldClass->getMemoryLeader() == From) { - if (OldClass->memory_empty()) { + if (OldClass->definesNoMemory()) { OldClass->setMemoryLeader(nullptr); } else { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); @@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, // Determine if a phi is cycle-free. That means the values in the phi don't // depend on any expressions that can change value as a result of the phi. // For example, a non-cycle free phi would be v = phi(0, v+1). -bool NewGVN::isCycleFree(const PHINode *PN) { +bool NewGVN::isCycleFree(const PHINode *PN) const { // In order to compute cycle-freeness, we do SCC finding on the phi, and see // what kind of SCC it ends up in. If it is a singleton, it is cycle-free. // If it is not in a singleton, it is only cycle free if the other members are @@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) { } // Evaluate PHI nodes symbolically, and create an expression result. -const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // True if one of the incoming phi edges is a backedge. bool HasBackedge = false; // All constant tracks the state of whether all the *original* phi operands @@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) { return E; } -const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) { +const Expression * +NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const { if (auto *EI = dyn_cast(I)) { auto *II = dyn_cast(EI->getAggregateOperand()); if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) { @@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) { return createAggregateValueExpression(I); } -const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { auto *CI = dyn_cast(I); // See if our operands are equal to those of a previous predicate, and if so, // if it implies true or false. @@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) { } // Substitute and symbolize the value before value numbering. 
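
// isCycleFree above asks whether a phi's value can change as a result of
// the phi itself: v = phi(0, v + 1) is cyclic, while a phi over
// phi-independent values is not. A toy sketch of the underlying
// reachability question on a def-use graph; the real pass answers it with
// SCC finding rather than this plain DFS:
#include <cassert>
#include <vector>

// Node I's operands are indices into the same array.
static bool dependsOnSelf(const std::vector<std::vector<int>> &Operands,
                          int Phi) {
  std::vector<bool> Seen(Operands.size(), false);
  std::vector<int> Work(Operands[Phi].begin(), Operands[Phi].end());
  while (!Work.empty()) {
    int N = Work.back();
    Work.pop_back();
    if (N == Phi)
      return true; // the phi feeds its own operands: a value cycle
    if (Seen[N])
      continue;
    Seen[N] = true;
    Work.insert(Work.end(), Operands[N].begin(), Operands[N].end());
  }
  return false;
}

int main() {
  // Node 0: v = phi(n1, n2); node 1: the constant 0; node 2: v + 1.
  std::vector<std::vector<int>> G = {{1, 2}, {}, {0}};
  assert(dependsOnSelf(G, 0));  // v = phi(0, v + 1) is not cycle-free
  G[2] = {1};                   // make the add independent of the phi
  assert(!dependsOnSelf(G, 0)); // now the phi is cycle-free
  return 0;
}
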
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) { +const Expression *NewGVN::performSymbolicEvaluation(Value *V) const { const Expression *E = nullptr; if (auto *C = dyn_cast(V)) E = createConstantExpression(C); @@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) { } } -void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) { +void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const { DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n"); MemoryToUsers[To].insert(U); } @@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) { } // Add I to the set of users of a given predicate. -void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) { +void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const { if (auto *PBranch = dyn_cast(PB)) PredicateToUsers[PBranch->Condition].insert(I); else if (auto *PAssume = dyn_cast(PB)) @@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const { // TODO: If this ends up to slow, we can maintain a next memory leader like we // do for regular leaders. // Make sure there will be a leader to find - assert((CC->getStoreCount() > 0 || !CC->memory_empty()) && - "Can't get next leader if there is none"); + assert(!CC->definesNoMemory() && "Can't get next leader if there is none"); if (CC->getStoreCount() > 0) { if (auto *NL = dyn_cast_or_null(CC->getNextLeader().first)) return MSSA->getMemoryAccess(NL); @@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I, setMemoryClass(InstMA, NewClass); // Now, fixup the old class if necessary if (OldClass->getMemoryLeader() == InstMA) { - if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) { + if (!OldClass->definesNoMemory()) { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); DEBUG(dbgs() << "Memory class leader change for class " << OldClass->getID() << " to " @@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) { // If it's a store expression we are using, it means we are not equivalent // to something earlier. - if (isa(E)) { - assert(lookupOperandLeader(SI->getValueOperand()) != - NewClass->getLeader()); - NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); + if (auto *SE = dyn_cast(E)) { + assert(SE->getStoredValue() != NewClass->getLeader()); + NewClass->setStoredValue(SE->getStoredValue()); markValueLeaderChangeTouched(NewClass); // Shift the new class leader to be the store DEBUG(dbgs() << "Changing leader of congruence class " @@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, // See if we destroyed the class or need to swap leaders. 
if (OldClass->empty() && OldClass != TOPClass) { if (OldClass->getDefiningExpr()) { - DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr() + DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr() << " from table\n"); ExpressionToClass.erase(OldClass->getDefiningExpr()); } @@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) { } else if (const auto *SE = dyn_cast(E)) { StoreInst *SI = SE->getStoreInst(); NewClass->setLeader(SI); - NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); + NewClass->setStoredValue(SE->getStoredValue()); // The RepMemoryAccess field will be filled in properly by the // moveValueToNewCongruenceClass call. } else { @@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const { return false; if (auto *MemDef = dyn_cast(Pair.first)) return !isInstructionTriviallyDead(MemDef->getMemoryInst()); + + // We could have phi nodes which operands are all trivially dead, + // so we don't process them. + if (auto *MemPHI = dyn_cast(Pair.first)) { + for (auto &U : MemPHI->incoming_values()) { + if (Instruction *I = dyn_cast(U.get())) { + if (!isInstructionTriviallyDead(I)) + return true; + } + } + return false; + } + return true; }; diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index fb1b47c48276..4f608c97147d 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, /// Update the dominator tree after removing one exiting predecessor of a loop /// exit block. static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L, - DominatorTree &DT) { + DominatorTree &DT) { assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) && "Cannot have empty predecessors of the loop exit block if we split " "off a block to unswitch!"); @@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, } } +/// Check that all the LCSSA PHI nodes in the loop exit block have trivial +/// incoming values along this edge. +static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, + BasicBlock &ExitBB) { + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast(&I); + if (!PN) + // No more PHIs to check. + return true; + + // If the incoming value for this edge isn't loop invariant the unswitch + // won't be trivial. + if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB))) + return false; + } + llvm_unreachable("Basic blocks should never be empty!"); +} + +/// Rewrite the PHI nodes in an unswitched loop exit basic block. +/// +/// Requires that the loop exit and unswitched basic block are the same, and +/// that the exiting block was a unique predecessor of that block. Rewrites the +/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial +/// PHI nodes from the old preheader that now contains the unswitched +/// terminator. +static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + for (Instruction &I : UnswitchedBB) { + auto *PN = dyn_cast(&I); + if (!PN) + // No more PHIs to check. + break; + + // When the loop exit is directly unswitched we just need to update the + // incoming basic block. We loop to handle weird cases with repeated + // incoming blocks, but expect to typically only have one operand here. 
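
// Trivial unswitching, as gated by areLoopExitPHIsLoopInvariant above,
// hoists a loop-invariant exit test out of the loop entirely. A
// hand-written, source-level illustration of the effect (the pass performs
// this on IR and additionally rewrites the LCSSA PHIs in the exit block):
#include <cassert>

static int beforeUnswitch(bool C, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I) {
    if (C)        // loop-invariant condition, tested every iteration
      return Sum; // loop exit
    Sum += I;
  }
  return Sum;
}

static int afterUnswitch(bool C, int N) {
  int Sum = 0;
  if (C)          // tested once, in what used to be the preheader
    return Sum;
  for (int I = 0; I < N; ++I)
    Sum += I;     // the loop body no longer branches on C
  return Sum;
}

int main() {
  for (bool C : {false, true})
    for (int N : {0, 1, 5})
      assert(beforeUnswitch(C, N) == afterUnswitch(C, N));
  return 0;
}
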
+ for (auto i : llvm::seq(0, PN->getNumOperands())) { + assert(PN->getIncomingBlock(i) == &OldExitingBB && + "Found incoming block different from unique predecessor!"); + PN->setIncomingBlock(i, &OldPH); + } + } +} + +/// Rewrite the PHI nodes in the loop exit basic block and the split off +/// unswitched block. +/// +/// Because the exit block remains an exit from the loop, this rewrites the +/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI +/// nodes into the unswitched basic block to select between the value in the +/// old preheader and the loop exit. +static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, + BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + assert(&ExitBB != &UnswitchedBB && + "Must have different loop exit and unswitched blocks!"); + Instruction *InsertPt = &*UnswitchedBB.begin(); + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast(&I); + if (!PN) + // No more PHIs to check. + break; + + auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2, + PN->getName() + ".split", InsertPt); + + // Walk backwards over the old PHI node's inputs to minimize the cost of + // removing each one. We have to do this weird loop manually so that we + // create the same number of new incoming edges in the new PHI as we expect + // each case-based edge to be included in the unswitched switch in some + // cases. + // FIXME: This is really, really gross. It would be much cleaner if LLVM + // allowed us to create a single entry for a predecessor block without + // having separate entries for each "edge" even though these edges are + // required to produce identical results. + for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) { + if (PN->getIncomingBlock(i) != &OldExitingBB) + continue; + + Value *Incoming = PN->removeIncomingValue(i); + NewPN->addIncoming(Incoming, &OldPH); + } + + // Now replace the old PHI with the new one and wire the old one in as an + // input to the new one. + PN->replaceAllUsesWith(NewPN); + NewPN->addIncoming(PN, &ExitBB); + } +} + /// Unswitch a trivial branch if the condition is loop invariant. /// /// This routine should only be called when loop code leading to the branch has @@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, assert(L.contains(ContinueBB) && "Cannot have both successors exit and still be in the loop!"); - // If the loop exit block contains phi nodes, this isn't trivial. - // FIXME: We should examine the PHI to determine whether or not we can handle - // it trivially. - if (isa(LoopExitBB->begin())) + auto *ParentBB = BI.getParent(); + if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) return false; DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal @@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, BasicBlock *UnswitchedBB; if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) { (void)PredBB; - assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!"); + assert(PredBB == BI.getParent() && + "A branch's parent isn't a predecessor!"); UnswitchedBB = LoopExitBB; } else { UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI); } - BasicBlock *ParentBB = BI.getParent(); - // Now splice the branch to gate reaching the new preheader and re-point its // successors. 
OldPH->getInstList().splice(std::prev(OldPH->end()), @@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // terminator. BranchInst::Create(ContinueBB, ParentBB); + // Rewrite the relevant PHI nodes. + if (UnswitchedBB == LoopExitBB) + rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); + else + rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, + *ParentBB, *OldPH); + // Now we need to update the dominator tree. updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); // But if we split something off of the loop exit block then we also removed @@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (!L.isLoopInvariant(LoopCond)) return false; + auto *ParentBB = SI.getParent(); + // FIXME: We should compute this once at the start and update it! SmallVector ExitBlocks; L.getExitBlocks(ExitBlocks); @@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SmallVector ExitCaseIndices; for (auto Case : SI.cases()) { auto *SuccBB = Case.getCaseSuccessor(); - if (ExitBlockSet.count(SuccBB) && !isa(SuccBB->begin())) + if (ExitBlockSet.count(SuccBB) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB)) ExitCaseIndices.push_back(Case.getCaseIndex()); } BasicBlock *DefaultExitBB = nullptr; if (ExitBlockSet.count(SI.getDefaultDest()) && - !isa(SI.getDefaultDest()->begin()) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) && !isa(SI.getDefaultDest()->getTerminator())) DefaultExitBB = SI.getDefaultDest(); else if (ExitCaseIndices.empty()) @@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (CommonSuccBB) { SI.setDefaultDest(CommonSuccBB); } else { - BasicBlock *ParentBB = SI.getParent(); BasicBlock *UnreachableBB = BasicBlock::Create( ParentBB->getContext(), Twine(ParentBB->getName()) + ".unreachable_default", @@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // Now add the unswitched switch. auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH); - // Split any exit blocks with remaining in-loop predecessors. We walk in - // reverse so that we split in the same order as the cases appeared. This is - // purely for convenience of reading the resulting IR, but it doesn't cost - // anything really. + // Rewrite the IR for the unswitched basic blocks. This requires two steps. + // First, we split any exit blocks with remaining in-loop predecessors. Then + // we update the PHIs in one of two ways depending on if there was a split. + // We walk in reverse so that we split in the same order as the cases + // appeared. This is purely for convenience of reading the resulting IR, but + // it doesn't cost anything really. + SmallPtrSet UnswitchedExitBBs; SmallDenseMap SplitExitBBMap; // Handle the default exit if necessary. // FIXME: It'd be great if we could merge this with the loop below but LLVM's // ranges aren't quite powerful enough yet. 
- if (DefaultExitBB && !pred_empty(DefaultExitBB)) { - auto *SplitBB = - SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); - updateLoopExitIDom(DefaultExitBB, L, DT); - DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; + if (DefaultExitBB) { + if (pred_empty(DefaultExitBB)) { + UnswitchedExitBBs.insert(DefaultExitBB); + rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH); + } else { + auto *SplitBB = + SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); + rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, + *ParentBB, *OldPH); + updateLoopExitIDom(DefaultExitBB, L, DT); + DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; + } } // Note that we must use a reference in the for loop so that we update the // container. for (auto &CasePair : reverse(ExitCases)) { // Grab a reference to the exit block in the pair so that we can update it. - BasicBlock *&ExitBB = CasePair.second; + BasicBlock *ExitBB = CasePair.second; // If this case is the last edge into the exit block, we can simply reuse it // as it will no longer be a loop exit. No mapping necessary. - if (pred_empty(ExitBB)) + if (pred_empty(ExitBB)) { + // Only rewrite once. + if (UnswitchedExitBBs.insert(ExitBB).second) + rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH); continue; + } // Otherwise we need to split the exit block so that we retain an exit // block from the loop and a target for the unswitched condition. @@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (!SplitExitBB) { // If this is the first time we see this, do the split and remember it. SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); + rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, + *ParentBB, *OldPH); updateLoopExitIDom(ExitBB, L, DT); } - ExitBB = SplitExitBB; + // Update the case pair to point to the split block. + CasePair.second = SplitExitBB; } // Now add the unswitched cases. We do this in reverse order as we built them diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp index a0fc966cee2c..a7c308b59877 100644 --- a/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) { return false; } +static unsigned ComputeSpeculationCost(const Instruction *I, + const TargetTransformInfo &TTI) { + switch (Operator::getOpcode(I)) { + case Instruction::GetElementPtr: + case Instruction::Add: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Select: + case Instruction::Shl: + case Instruction::Sub: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Xor: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::Call: + case Instruction::BitCast: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::AddrSpaceCast: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPExt: + case Instruction::FPTrunc: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::ICmp: + case Instruction::FCmp: + return TTI.getUserCost(I); + + default: + return UINT_MAX; // Disallow anything not whitelisted. 
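
// ComputeSpeculationCost above whitelists the opcodes that are reasonable
// candidates for speculative hoisting and prices everything else at
// UINT_MAX. The same gate, reduced to a standalone sketch (toy opcode enum
// and unit costs; the real code asks TTI.getUserCost for listed opcodes):
#include <cassert>
#include <climits>

enum class Opcode { Add, Mul, Select, Call, Unknown };

static unsigned speculationCost(Opcode Op) {
  switch (Op) {
  case Opcode::Add:
  case Opcode::Mul:
  case Opcode::Select:
  case Opcode::Call:
    return 1;        // whitelisted: use the (stand-in) target cost
  default:
    return UINT_MAX; // disallow anything not whitelisted
  }
}

// Hoisting is only considered while the accumulated cost stays bounded.
static bool mayHoist(Opcode Op, unsigned Budget) {
  unsigned Cost = speculationCost(Op);
  return Cost != UINT_MAX && Cost <= Budget;
}

int main() {
  assert(mayHoist(Opcode::Add, 4));
  assert(!mayHoist(Opcode::Unknown, 4)); // never speculated
  return 0;
}
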
+ } +} + bool SpeculativeExecutionPass::considerHoistingFromTo( BasicBlock &FromBlock, BasicBlock &ToBlock) { SmallSet NotHoisted; @@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( unsigned TotalSpeculationCost = 0; for (auto& I : FromBlock) { - const unsigned Cost = TTI->getUserCost(&I); + const unsigned Cost = ComputeSpeculationCost(&I, *TTI); if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) && AllPrecedingUsesFromBlockHoisted(&I)) { TotalSpeculationCost += Cost; diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index 7ffdad597a9b..83ec7f55d1af 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V, computeKnownBits(V, Known, DL); - if (Known.Zero.countLeadingOnes() >= HiBits) + if (Known.countMinLeadingZeros() >= HiBits) return VALRNG_KNOWN_SHORT; - if (Known.One.countLeadingZeros() < HiBits) + if (Known.countMaxLeadingZeros() < HiBits) return VALRNG_LIKELY_LONG; // Long integer divisions are often used in hashtable implementations. It's diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index d5124ac89016..4aa26fd14fee 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix, Function *F, ClonedCodeInfo *CodeInfo) { + DenseMap Cache; BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); @@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { Instruction *NewInst = II->clone(); + if (F && F->getSubprogram()) + DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(), + F->getSubprogram(), Cache); if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); @@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, SmallVector, 1> MDs; OldFunc->getAllMetadata(MDs); - for (auto MD : MDs) - NewFunc->addMetadata( - MD.first, - *MapMetadata(MD.second, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + for (auto MD : MDs) { + MDNode *NewMD; + bool MustCloneSP = + (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() && + OldFunc->getParent() == NewFunc->getParent()); + if (MustCloneSP) { + auto *SP = cast(MD.second); + NewMD = DISubprogram::getDistinct( + NewFunc->getContext(), SP->getScope(), SP->getName(), + NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(), + SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(), + SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(), + SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(), + SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(), + SP->getVariables(), SP->getThrownTypes()); + } else + NewMD = + MapMetadata(MD.second, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); + NewFunc->addMetadata(MD.first, *NewMD); + } // Loop over all of the basic blocks in the function, cloning them as // appropriate. 
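
// The BypassSlowDivision hunk above replaces direct reads of Known.Zero and
// Known.One with the countMinLeadingZeros/countMaxLeadingZeros helpers,
// which wrap the same two masks. A self-contained sketch of what the
// helpers mean (toy struct mirroring, not reproducing, llvm::KnownBits):
#include <cassert>
#include <cstdint>

struct ToyKnownBits {
  uint32_t Zero = 0; // bits known to be 0
  uint32_t One = 0;  // bits known to be 1

  // Minimum leading zeros of the value == leading ones in the Zero mask.
  // ("| 1" conservatively guards the __builtin_clz(0) undefined case.)
  unsigned countMinLeadingZeros() const {
    return static_cast<unsigned>(__builtin_clz(~Zero | 1));
  }
  // Maximum leading zeros of the value == leading zeros in the One mask.
  unsigned countMaxLeadingZeros() const {
    return One ? static_cast<unsigned>(__builtin_clz(One)) : 32;
  }
};

int main() {
  ToyKnownBits K;
  K.Zero = 0xFFFF0000u; // the top 16 bits are known zero:
  assert(K.countMinLeadingZeros() == 16); // value fits in 16 bits, so a
                                          // short division suffices
  K.One = 0x00008000u;  // bit 15 is known set:
  assert(K.countMaxLeadingZeros() == 16); // value is at least 2^15
  return 0;
}
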
Note that we save BE this way in order to handle cloning of diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index 4e9d67252d6c..5444b752de82 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -96,7 +96,7 @@ std::unique_ptr llvm::CloneModule( else GV = new GlobalVariable( *New, I->getValueType(), false, GlobalValue::ExternalLinkage, - (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr, + nullptr, I->getName(), nullptr, I->getThreadLocalMode(), I->getType()->getAddressSpace()); VMap[&*I] = GV; // We do not copy attributes (mainly because copying between different diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp index 8c2386554da5..78d7474e5b95 100644 --- a/lib/Transforms/Utils/EscapeEnumerator.cpp +++ b/lib/Transforms/Utils/EscapeEnumerator.cpp @@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() { // Create a cleanup block. LLVMContext &C = F.getContext(); BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F); - Type *ExnTy = - StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr); + Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C)); if (!F.hasPersonalityFn()) { Constant *PersFn = getDefaultPersonalityFn(F.getParent()); F.setPersonalityFn(PersFn); diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 6d56e08af99f..9cb4762b683c 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) { return false; } -/// Rebuild the entire inlined-at chain for this instruction so that the top of -/// the chain now is inlined-at the new call site. -static DebugLoc -updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, - LLVMContext &Ctx, - DenseMap &IANodes) { - SmallVector InlinedAtLocations; - DILocation *Last = InlinedAtNode; - DILocation *CurInlinedAt = DL; - - // Gather all the inlined-at nodes - while (DILocation *IA = CurInlinedAt->getInlinedAt()) { - // Skip any we've already built nodes for - if (DILocation *Found = IANodes[IA]) { - Last = Found; - break; - } - - InlinedAtLocations.push_back(IA); - CurInlinedAt = IA; - } - - // Starting from the top, rebuild the nodes to point to the new inlined-at - // location (then rebuilding the rest of the chain behind it) and update the - // map of already-constructed inlined-at nodes. - for (const DILocation *MD : reverse(InlinedAtLocations)) { - Last = IANodes[MD] = DILocation::getDistinct( - Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); - } - - // And finally create the normal location for this instruction, referring to - // the new inlined-at chain. - return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); -} - /// Return the result of AI->isStaticAlloca() if AI were moved to the entry /// block. Allocas used in inalloca calls and allocas of dynamic array size /// cannot be static. @@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // Cache the inlined-at nodes as they're built so they are reused, without // this every instruction's inlined-at chain would become distinct from each // other. 
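
// The updateInlinedAtInfo body deleted above (its job moves to
// DebugLoc::appendInlinedAt in the next hunk) re-roots an inlined-at chain
// at the new call site, caching rebuilt nodes so locations that share a
// suffix also share the rebuilt nodes. The shape of that algorithm on a toy
// singly-linked chain (hypothetical types, not the DILocation API):
#include <cassert>
#include <map>
#include <memory>
#include <vector>

struct Loc {
  int Line;
  const Loc *InlinedAt; // nullptr at the end of the chain
};

static const Loc *reparent(const Loc *DL, const Loc *NewRoot,
                           std::vector<std::unique_ptr<Loc>> &Storage,
                           std::map<const Loc *, const Loc *> &Cache) {
  if (!DL)
    return NewRoot; // bottom of the chain: attach the new call site
  auto It = Cache.find(DL);
  if (It != Cache.end())
    return It->second; // already rebuilt for an earlier instruction
  Storage.push_back(std::make_unique<Loc>(
      Loc{DL->Line, reparent(DL->InlinedAt, NewRoot, Storage, Cache)}));
  return Cache[DL] = Storage.back().get();
}

int main() {
  Loc A{10, nullptr}, B{20, &A}; // chain: B inlined at A
  Loc CallSite{99, nullptr};
  std::vector<std::unique_ptr<Loc>> Storage;
  std::map<const Loc *, const Loc *> Cache;
  const Loc *NewB = reparent(&B, &CallSite, Storage, Cache);
  assert(NewB->Line == 20 && NewB->InlinedAt->Line == 10 &&
         NewB->InlinedAt->InlinedAt == &CallSite); // re-rooted at CallSite
  assert(reparent(&B, &CallSite, Storage, Cache) == NewB); // cache hit
  return 0;
}
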
- DenseMap IANodes; + DenseMap IANodes; for (; FI != Fn->end(); ++FI) { for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { if (DebugLoc DL = BI->getDebugLoc()) { - BI->setDebugLoc( - updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes)); + auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(), + IANodes); + auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA); + BI->setDebugLoc(IDL); continue; } @@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock, /// Update the branch metadata for cloned call instructions. static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, const Optional &CalleeEntryCount, - const Instruction *TheCall) { + const Instruction *TheCall, + ProfileSummaryInfo *PSI) { if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1) return; Optional CallSiteCount = - ProfileSummaryInfo::getProfileCount(TheCall, nullptr); + PSI ? PSI->getProfileCount(TheCall, nullptr) : None; uint64_t CallCount = std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0, CalleeEntryCount.getValue()); @@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, /// The callsite's block count is subtracted from the callee's function entry /// count. static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB, - Instruction *CallInst, Function *Callee) { + Instruction *CallInst, Function *Callee, + ProfileSummaryInfo *PSI) { // If the callee has a original count of N, and the estimated count of // callsite is M, the new callee count is set to N - M. M is estimated from // the caller's entry count, its entry block frequency and the block frequency // of the callsite. Optional CalleeCount = Callee->getEntryCount(); - if (!CalleeCount.hasValue()) + if (!CalleeCount.hasValue() || !PSI) return; - Optional CallCount = - ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI); + Optional CallCount = PSI->getProfileCount(CallInst, CallerBFI); if (!CallCount.hasValue()) return; // Since CallSiteCount is an estimate, it could exceed the original callee @@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI, CalledFunc->front()); - updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall); + updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, + IFI.PSI); // Update the profile count of callee. - updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc); + updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI); // Inject byval arguments initialization. 
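
// updateCalleeCount above maintains the invariant "new callee entry count =
// original count minus what the inlined call site is estimated to consume",
// clamping because the call-site count is only an estimate derived from the
// caller's block frequencies. The arithmetic in isolation:
#include <cassert>
#include <cstdint>

static uint64_t updatedCalleeCount(uint64_t CalleeEntryCount,
                                   uint64_t EstimatedCallSiteCount) {
  // The estimate may exceed the callee's real entry count; clamp rather
  // than letting the subtraction wrap.
  uint64_t Consumed = EstimatedCallSiteCount < CalleeEntryCount
                          ? EstimatedCallSiteCount
                          : CalleeEntryCount;
  return CalleeEntryCount - Consumed;
}

int main() {
  assert(updatedCalleeCount(1000, 400) == 600); // normal case: N - M
  assert(updatedCalleeCount(1000, 5000) == 0);  // overshoot: clamp to zero
  return 0;
}
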
for (std::pair &Init : ByValInit) diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp index 8a1973d1db05..53b432fcafd4 100644 --- a/lib/Transforms/Utils/InstructionNamer.cpp +++ b/lib/Transforms/Utils/InstructionNamer.cpp @@ -26,16 +26,15 @@ namespace { InstNamer() : FunctionPass(ID) { initializeInstNamerPass(*PassRegistry::getPassRegistry()); } - + void getAnalysisUsage(AnalysisUsage &Info) const override { Info.setPreservesAll(); } bool runOnFunction(Function &F) override { - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); - AI != AE; ++AI) - if (!AI->hasName() && !AI->getType()->isVoidTy()) - AI->setName("arg"); + for (auto &Arg : F.args()) + if (!Arg.hasName()) + Arg.setName("arg"); for (BasicBlock &BB : F) { if (!BB.hasName()) @@ -48,11 +47,11 @@ namespace { return true; } }; - + char InstNamer::ID = 0; } -INITIALIZE_PASS(InstNamer, "instnamer", +INITIALIZE_PASS(InstNamer, "instnamer", "Assign names to anonymous instructions", false, false) char &llvm::InstructionNamerID = InstNamer::ID; //===----------------------------------------------------------------------===// diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index ce6b703f3528..1ca509472b5f 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, KnownBits Known(BitWidth); computeKnownBits(V, Known, DL, 0, AC, CxtI, DT); - unsigned TrailZ = Known.Zero.countTrailingOnes(); + unsigned TrailZ = Known.countMinTrailingZeros(); // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. @@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar, void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI, DIBuilder &Builder) { auto *DIVar = DDI->getVariable(); - auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); + auto *DIExpr = DDI->getExpression(); + Value *DV = SI->getOperand(0); // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (SExtInst *SExt = dyn_cast(SI->getOperand(0))) ExtendedArg = dyn_cast(SExt->getOperand(0)); if (ExtendedArg) { - // We're now only describing a subset of the variable. The fragment we're - // describing will always be smaller than the variable size, because - // VariableSize == Size of Alloca described by DDI. Since SI stores - // to the alloca described by DDI, if it's first operand is an extend, - // we're guaranteed that before extension, the value was narrower than - // the size of the alloca, hence the size of the described variable. - SmallVector Ops; - unsigned FragmentOffset = 0; - // If this already is a bit fragment, we drop the bit fragment from the - // expression and record the offset. - auto Fragment = DIExpr->getFragmentInfo(); - if (Fragment) { - Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); - FragmentOffset = Fragment->OffsetInBits; - } else { - Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + // If this DDI was already describing only a fragment of a variable, ensure + // that fragment is appropriately narrowed here. 
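
// The getOrEnforceKnownAlignment hunk above derives alignment from
// countMinTrailingZeros: if the low K bits of a pointer are known zero,
// the pointer is 2^K aligned. The clamp matters because a known-null
// pointer has every trailing bit known zero and the shift would overflow.
// A sketch of that computation (illustrative constants):
#include <cassert>
#include <cstdint>

static uint64_t alignFromTrailingZeros(unsigned KnownTrailingZeros) {
  // Avoid trouble with ridiculously large TrailZ values, such as those
  // computed from a null pointer (this mirrors the pass's clamp).
  const unsigned MaxShift = 31;
  unsigned Shift =
      KnownTrailingZeros < MaxShift ? KnownTrailingZeros : MaxShift;
  return uint64_t(1) << Shift;
}

int main() {
  assert(alignFromTrailingZeros(4) == 16); // low nibble zero: 16-byte aligned
  assert(alignFromTrailingZeros(64) == (uint64_t(1) << 31)); // clamped
  return 0;
}
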
+ // But if a fragment wasn't used, describe the value as the original + // argument (rather than the zext or sext) so that it remains described even + // if the sext/zext is optimized away. This widens the variable description, + // leaving it up to the consumer to know how the smaller value may be + // represented in a larger register. + if (auto Fragment = DIExpr->getFragmentInfo()) { + unsigned FragmentOffset = Fragment->OffsetInBits; + SmallVector Ops(DIExpr->elements_begin(), + DIExpr->elements_end() - 3); + Ops.push_back(dwarf::DW_OP_LLVM_fragment); + Ops.push_back(FragmentOffset); + const DataLayout &DL = DDI->getModule()->getDataLayout(); + Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); + DIExpr = Builder.createExpression(Ops); } - Ops.push_back(dwarf::DW_OP_LLVM_fragment); - Ops.push_back(FragmentOffset); - const DataLayout &DL = DDI->getModule()->getDataLayout(); - Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); - auto NewDIExpr = Builder.createExpression(Ops); - if (!LdStHasDebugValue(DIVar, NewDIExpr, SI)) - Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr, - DDI->getDebugLoc(), SI); - } else if (!LdStHasDebugValue(DIVar, DIExpr, SI)) - Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, - DDI->getDebugLoc(), SI); + DV = ExtendedArg; + } + if (!LdStHasDebugValue(DIVar, DIExpr, SI)) + Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(), + SI); } /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value @@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) { combineMetadata(K, J, KnownIDs); } -unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, - DominatorTree &DT, - const BasicBlockEdge &Root) { +template +static unsigned replaceDominatedUsesWith(Value *From, Value *To, + const RootType &Root, + const DominatesFn &Dominates) { assert(From->getType() == To->getType()); - + unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); - UI != UE; ) { + UI != UE;) { Use &U = *UI++; - if (DT.dominates(Root, U)) { - U.set(To); - DEBUG(dbgs() << "Replace dominated use of '" - << From->getName() << "' as " - << *To << " in " << *U << "\n"); - ++Count; - } + if (!Dominates(Root, U)) + continue; + U.set(To); + DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " + << *To << " in " << *U << "\n"); + ++Count; } return Count; } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, - const BasicBlock *BB) { - assert(From->getType() == To->getType()); + const BasicBlockEdge &Root) { + auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) { + return DT.dominates(Root, U); + }; + return ::replaceDominatedUsesWith(From, To, Root, Dominates); +} - unsigned Count = 0; - for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); - UI != UE;) { - Use &U = *UI++; - auto *I = cast(U.getUser()); - if (DT.properlyDominates(BB, I->getParent())) { - U.set(To); - DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " - << *To << " in " << *U << "\n"); - ++Count; - } - } - return Count; +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const BasicBlock *BB) { + auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) { + auto *I = cast(U.getUser())->getParent(); + return DT.properlyDominates(BB, I); + }; + return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates); } bool 
llvm::callsGCLeafFunction(ImmutableCallSite CS) { diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 175d013a011d..81f033e7d51a 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -1112,3 +1113,203 @@ Optional llvm::getLoopEstimatedTripCount(Loop *L) { else return (FalseVal + (TrueVal / 2)) / TrueVal; } + +/// \brief Adds a 'fast' flag to floating point operations. +static Value *addFastMathFlag(Value *V) { + if (isa(V)) { + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + cast(V)->setFastMathFlags(Flags); + } + return V; +} + +// Helper to generate a log2 shuffle reduction. +Value * +llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, + RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, + ArrayRef RedOps) { + unsigned VF = Src->getType()->getVectorNumElements(); + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = Src; + SmallVector ShuffleMask(VF, nullptr); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = Builder.getInt32(i / 2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector( + TmpVec, UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), "rdx.shuf"); + + if (Op != Instruction::ICmp && Op != Instruction::FCmp) { + // Floating point operations had to be 'fast' to enable the reduction. + TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op, + TmpVec, Shuf, "bin.rdx")); + } else { + assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid && + "Invalid min/max"); + TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec, + Shuf); + } + if (!RedOps.empty()) + propagateIRFlags(TmpVec, RedOps); + } + // The result is in the first element of the vector. + return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); +} + +/// Create a simple vector reduction specified by an opcode and some +/// flags (if generating min/max reductions). +Value *llvm::createSimpleTargetReduction( + IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode, + Value *Src, TargetTransformInfo::ReductionFlags Flags, + ArrayRef RedOps) { + assert(isa(Src->getType()) && "Type must be a vector"); + + Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType()); + std::function BuildFunc; + using RD = RecurrenceDescriptor; + RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid; + // TODO: Support creating ordered reductions. 
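The loop in getShuffleReduction above is easiest to follow by writing out the masks it builds. A stand-alone C++ sketch (illustrative only; it prints the lane indices, with 'u' marking an undef filler lane) of the mask produced in each of the log2(VF) rounds:

#include <cstdio>

int main() {
  const unsigned VF = 8; // must be a power of two, as asserted above
  for (unsigned i = VF; i != 1; i >>= 1) {
    std::printf("width %u: [", i);
    for (unsigned j = 0; j != VF; ++j) {
      if (j != 0)
        std::printf(", ");
      if (j < i / 2)
        std::printf("%u", i / 2 + j); // upper half of live lanes moved down
      else
        std::printf("u");             // rest of the mask filled with undef
    }
    std::printf("]\n");
  }
  return 0;
}

This prints [4, 5, 6, 7, u, u, u, u], then [2, 3, u, ...], then [1, u, ...]: each shuffle moves the upper half of the surviving lanes down, a binary op combines it with the previous vector, and after the last round the reduction sits in lane 0, which CreateExtractElement reads out.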
+ FastMathFlags FMFUnsafe; + FMFUnsafe.setUnsafeAlgebra(); + + switch (Opcode) { + case Instruction::Add: + BuildFunc = [&]() { return Builder.CreateAddReduce(Src); }; + break; + case Instruction::Mul: + BuildFunc = [&]() { return Builder.CreateMulReduce(Src); }; + break; + case Instruction::And: + BuildFunc = [&]() { return Builder.CreateAndReduce(Src); }; + break; + case Instruction::Or: + BuildFunc = [&]() { return Builder.CreateOrReduce(Src); }; + break; + case Instruction::Xor: + BuildFunc = [&]() { return Builder.CreateXorReduce(Src); }; + break; + case Instruction::FAdd: + BuildFunc = [&]() { + auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src); + cast(Rdx)->setFastMathFlags(FMFUnsafe); + return Rdx; + }; + break; + case Instruction::FMul: + BuildFunc = [&]() { + auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src); + cast(Rdx)->setFastMathFlags(FMFUnsafe); + return Rdx; + }; + break; + case Instruction::ICmp: + if (Flags.IsMaxOp) { + MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax; + BuildFunc = [&]() { + return Builder.CreateIntMaxReduce(Src, Flags.IsSigned); + }; + } else { + MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin; + BuildFunc = [&]() { + return Builder.CreateIntMinReduce(Src, Flags.IsSigned); + }; + } + break; + case Instruction::FCmp: + if (Flags.IsMaxOp) { + MinMaxKind = RD::MRK_FloatMax; + BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); }; + } else { + MinMaxKind = RD::MRK_FloatMin; + BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); }; + } + break; + default: + llvm_unreachable("Unhandled opcode"); + break; + } + if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) + return BuildFunc(); + return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); +} + +/// Create a vector reduction using a given recurrence descriptor. +Value *llvm::createTargetReduction(IRBuilder<> &Builder, + const TargetTransformInfo *TTI, + RecurrenceDescriptor &Desc, Value *Src, + bool NoNaN) { + // TODO: Support in-order reductions based on the recurrence descriptor. 
+  RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+  TargetTransformInfo::ReductionFlags Flags;
+  Flags.NoNaN = NoNaN;
+  auto getSimpleRdx = [&](unsigned Opc) {
+    return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+  };
+  switch (RecKind) {
+  case RecurrenceDescriptor::RK_FloatAdd:
+    return getSimpleRdx(Instruction::FAdd);
+  case RecurrenceDescriptor::RK_FloatMult:
+    return getSimpleRdx(Instruction::FMul);
+  case RecurrenceDescriptor::RK_IntegerAdd:
+    return getSimpleRdx(Instruction::Add);
+  case RecurrenceDescriptor::RK_IntegerMult:
+    return getSimpleRdx(Instruction::Mul);
+  case RecurrenceDescriptor::RK_IntegerAnd:
+    return getSimpleRdx(Instruction::And);
+  case RecurrenceDescriptor::RK_IntegerOr:
+    return getSimpleRdx(Instruction::Or);
+  case RecurrenceDescriptor::RK_IntegerXor:
+    return getSimpleRdx(Instruction::Xor);
+  case RecurrenceDescriptor::RK_IntegerMinMax: {
+    switch (Desc.getMinMaxRecurrenceKind()) {
+    case RecurrenceDescriptor::MRK_SIntMax:
+      Flags.IsSigned = true;
+      Flags.IsMaxOp = true;
+      break;
+    case RecurrenceDescriptor::MRK_UIntMax:
+      Flags.IsMaxOp = true;
+      break;
+    case RecurrenceDescriptor::MRK_SIntMin:
+      Flags.IsSigned = true;
+      break;
+    case RecurrenceDescriptor::MRK_UIntMin:
+      break;
+    default:
+      llvm_unreachable("Unhandled MRK");
+    }
+    return getSimpleRdx(Instruction::ICmp);
+  }
+  case RecurrenceDescriptor::RK_FloatMinMax: {
+    Flags.IsMaxOp =
+        Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+    return getSimpleRdx(Instruction::FCmp);
+  }
+  default:
+    llvm_unreachable("Unhandled RecKind");
+  }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+  if (auto *VecOp = dyn_cast<Instruction>(I)) {
+    if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+      // VecOp is initialized to the 0th scalar, so start counting from index
+      // '1'.
+      VecOp->copyIRFlags(I0);
+      for (int i = 1, e = VL.size(); i < e; ++i) {
+        if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+          VecOp->andIRFlags(Scalar);
+      }
+    }
+  }
+}
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 29d334f2968f..2ef3d6336ae2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
   // Upgrade a 2-field global array type to the new 3-field format if needed.
   if (Data && OldEltTy->getNumElements() < 3)
     EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
-                            IRB.getInt8PtrTy(), nullptr);
+                            IRB.getInt8PtrTy());
   else
     EltTy = OldEltTy;
   if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
     for (unsigned i = 0; i != n; ++i) {
       auto Ctor = cast<Constant>(Init->getOperand(i));
       if (EltTy != OldEltTy)
-        Ctor = ConstantStruct::get(
-            EltTy, Ctor->getAggregateElement((unsigned)0),
-            Ctor->getAggregateElement(1),
-            Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+        Ctor =
+            ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+                                Ctor->getAggregateElement(1),
+                                Constant::getNullValue(IRB.getInt8PtrTy()));
       CurrentCtors.push_back(Ctor);
     }
   }
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
   } else {
     // Use the new three-field struct if there isn't one already.
     EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
-                            IRB.getInt8PtrTy(), nullptr);
+                            IRB.getInt8PtrTy());
   }
 
   // Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
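propagateIRFlags, now defined above in LoopUtils, keeps a flag on the vector instruction only when every scalar being replaced carried it: copy the flags of the first scalar, then intersect with each of the rest. A toy stand-alone model of that copy-then-and pattern, using plain bitmasks rather than the llvm::Instruction flag API:

#include <cstdio>

int main() {
  enum : unsigned { NSW = 1, NUW = 2, Exact = 4 }; // toy flag stand-ins
  unsigned Scalars[] = {NSW | NUW, NSW, NSW | Exact};

  unsigned VecFlags = Scalars[0]; // copyIRFlags from the 0th scalar
  for (int i = 1; i < 3; ++i)
    VecFlags &= Scalars[i];       // andIRFlags with each later scalar

  // Only nsw survives: it is the one flag common to all three scalars.
  std::printf("nsw=%d nuw=%d exact=%d\n", !!(VecFlags & NSW),
              !!(VecFlags & NUW), !!(VecFlags & Exact));
  return 0;
}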
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 9e71d746de34..1de579ed41b0 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, // x86_64 can't use {float, float} since that would be returned in both // xmm0 and xmm1, which isn't what a real struct would do. ResTy = T.getArch() == Triple::x86_64 - ? static_cast(VectorType::get(ArgTy, 2)) - : static_cast(StructType::get(ArgTy, ArgTy, nullptr)); + ? static_cast(VectorType::get(ArgTy, 2)) + : static_cast(StructType::get(ArgTy, ArgTy)); } else { Name = "__sincospi_stret"; - ResTy = StructType::get(ArgTy, ArgTy, nullptr); + ResTy = StructType::get(ArgTy, ArgTy); } Module *M = OrigCallee->getParent(); diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp index 83bd29dbca65..60d9ede2c487 100644 --- a/lib/Transforms/Utils/VNCoercion.cpp +++ b/lib/Transforms/Utils/VNCoercion.cpp @@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy, const DataLayout &DL) { LLVMContext &Ctx = SrcVal->getType()->getContext(); + // If two pointers are in the same address space, they have the same size, + // so we don't need to do any truncation, etc. This avoids introducing + // ptrtoint instructions for pointers that may be non-integral. + if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() && + cast(SrcVal->getType())->getAddressSpace() == + cast(LoadTy)->getAddressSpace()) { + return SrcVal; + } + uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8; // Compute which bits of the stored value are being used by the load. 
Convert diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 84d89f103a2f..930972924c3c 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, Constant *NewV; if (IsOldCtorDtor) { auto *S = cast(V); - auto *E1 = mapValue(S->getOperand(0)); - auto *E2 = mapValue(S->getOperand(1)); - Value *Null = Constant::getNullValue(VoidPtrTy); - NewV = - ConstantStruct::get(cast(EltTy), E1, E2, Null, nullptr); + auto *E1 = cast(mapValue(S->getOperand(0))); + auto *E2 = cast(mapValue(S->getOperand(1))); + Constant *Null = Constant::getNullValue(VoidPtrTy); + NewV = ConstantStruct::get(cast(EltTy), E1, E2, Null); } else { NewV = cast_or_null(mapValue(V)); } diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 97dcb40a1d72..9cf66382b581 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { if (!Safe) { KnownBits Known(BitWidth); computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); - if (Known.Zero.countTrailingZeros() < (BitWidth - 1)) + if (Known.countMaxTrailingOnes() < (BitWidth - 1)) Safe = true; } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3fde0a453962..516ab7d03a88 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -391,13 +391,14 @@ public: TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), AddedSafetyChecks(false) {} - // Perform the actual loop widening (vectorization). - void vectorize() { - // Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(); - // Widen each instruction in the old loop to a new one in the new loop. - vectorizeLoop(); - } + /// Create a new empty loop. Unlink the old loop and connect the new one. + void createVectorizedLoopSkeleton(); + + /// Vectorize a single instruction within the innermost loop. + void vectorizeInstruction(Instruction &I); + + /// Fix the vectorized code, taking care of header phi's, live-outs, and more. + void fixVectorizedLoop(); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -425,9 +426,6 @@ protected: EdgeMaskCacheTy; typedef DenseMap BlockMaskCacheTy; - /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(); - /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *CountRoundDown, Value *EndValue, @@ -436,8 +434,6 @@ protected: /// Create a new induction variable inside L. PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL); - /// Copy and widen the instructions from the old loop. - virtual void vectorizeLoop(); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(); @@ -450,10 +446,10 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); - /// \brief The Loop exit block may have single value PHI nodes where the - /// incoming value is 'Undef'. While vectorizing we only handled real values - /// that were defined inside the loop. Here we fix the 'undef case'. - /// See PR14725. 
+ /// \brief The Loop exit block may have single value PHI nodes with some + /// incoming value. While vectorizing we only handled real values + /// that were defined inside the loop and we should have one value for + /// each predecessor of its parent basic block. See PR14725. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -464,11 +460,6 @@ protected: /// respective conditions. void predicateInstructions(); - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl &DeadInstructions); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); @@ -481,10 +472,6 @@ protected: /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// A helper function to vectorize a single instruction within the innermost - /// loop. - void vectorizeInstruction(Instruction &I); - /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -1700,6 +1687,9 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + // Returns true if the NoNaN attribute is set on the function. + bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -2185,7 +2175,10 @@ public: /// passed Legality checks. class LoopVectorizationPlanner { public: - LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {} + LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, + LoopVectorizationLegality *Legal, + LoopVectorizationCostModel &CM) + : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} ~LoopVectorizationPlanner() {} @@ -2193,7 +2186,25 @@ public: LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); + /// Generate the IR code for the vectorized loop. + void executePlan(InnerLoopVectorizer &ILV); + +protected: + /// Collect the instructions from the original loop that would be trivially + /// dead in the vectorized loop if generated. + void collectTriviallyDeadInstructions( + SmallPtrSetImpl &DeadInstructions); + private: + /// The loop that we evaluate. + Loop *OrigLoop; + + /// Loop Info analysis. + LoopInfo *LI; + + /// The legality analysis. + LoopVectorizationLegality *Legal; + /// The profitablity analysis. LoopVectorizationCostModel &CM; }; @@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } -void InnerLoopVectorizer::createEmptyLoop() { +void InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { } } -void InnerLoopVectorizer::vectorizeLoop() { - //===------------------------------------------------===// - // - // Notice: any optimization or new instruction that go - // into the code below should be also be implemented in - // the cost-model. 
- // - //===------------------------------------------------===// - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - // Vectorize all instructions in the original loop that will not become - // trivially dead when vectorized. - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (Instruction &I : *BB) - if (!DeadInstructions.count(&I)) - vectorizeInstruction(I); - +void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF > 1) @@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Set the insertion point after the previous value if it is an instruction. // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1])) + // guaranteed to be an instruction in the vector loop. Also, if the previous + // value is a phi node, we should insert after all the phi nodes to avoid + // breaking basic block verification. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) || + isa(PreviousParts[UF - 1])) Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); else Builder.SetInsertPoint( @@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } - - // The result is in the first element of the vector. 
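The replacement code just below threads a NoNaN flag, taken from a function-level no-NaNs attribute, into createTargetReduction so FP min/max reductions may ignore NaN ordering. The body of hasFunNoNaNAttr is not part of this hunk; in LLVM of this period such queries are typically string-attribute comparisons along the lines of F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true" (an assumption here, shown only for illustration). A pure-C++ toy of that string test:

#include <cstdio>
#include <cstring>

// Hypothetical stand-in: AttrValue models the attribute's string value; the
// real query goes through llvm::Function, not a raw char pointer.
static bool hasFunNoNaNAttr(const char *AttrValue) {
  return AttrValue && std::strcmp(AttrValue, "true") == 0;
}

int main() {
  std::printf("%d\n", hasFunNoNaNAttr("true"));  // 1: min/max may ignore NaNs
  std::printf("%d\n", hasFunNoNaNAttr("false")); // 0: must stay NaN-correct
  return 0;
}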
+ bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { auto *LCSSAPhi = dyn_cast(&LEI); if (!LCSSAPhi) break; - if (LCSSAPhi->getNumIncomingValues() == 1) - LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), - LoopMiddleBlock); - } -} - -void InnerLoopVectorizer::collectTriviallyDeadInstructions( - SmallPtrSetImpl &DeadInstructions) { - BasicBlock *Latch = OrigLoop->getLoopLatch(); - - // We create new control-flow for the vectorized loop, so the original - // condition will be dead after vectorization if it's only used by the - // branch. - auto *Cmp = dyn_cast(Latch->getTerminator()->getOperand(0)); - if (Cmp && Cmp->hasOneUse()) - DeadInstructions.insert(Cmp); - - // We create new "steps" for induction variable updates to which the original - // induction variables map. An original update instruction will be dead if - // all its users except the induction variable are dead. - for (auto &Induction : *Legal->getInductionVars()) { - PHINode *Ind = Induction.first; - auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); - if (all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast(U)); - })) - DeadInstructions.insert(IndUpdate); + if (LCSSAPhi->getNumIncomingValues() == 1) { + assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && + "Incoming value isn't loop invariant"); + LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); + } } } @@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { return CM.selectVectorizationFactor(MaxVF); } +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { + // Perform the actual loop transformation. + + // 1. Create a new empty loop. Unlink the old loop and connect the new one. + ILV.createVectorizedLoopSkeleton(); + + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should also be implemented in + // the cost-model. + // + //===------------------------------------------------===// + + // 2. Copy and widen instructions from the old loop into the new loop. + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + ILV.vectorizeInstruction(I); + + // 3. 
Fix the vectorized code: take care of header phi's, live-outs, + // predication, updating analyses. + ILV.fixVectorizedLoop(); +} + +void LoopVectorizationPlanner::collectTriviallyDeadInstructions( + SmallPtrSetImpl &DeadInstructions) { + BasicBlock *Latch = OrigLoop->getLoopLatch(); + + // We create new control-flow for the vectorized loop, so the original + // condition will be dead after vectorization if it's only used by the + // branch. + auto *Cmp = dyn_cast(Latch->getTerminator()->getOperand(0)); + if (Cmp && Cmp->hasOneUse()) + DeadInstructions.insert(Cmp); + + // We create new "steps" for induction variable updates to which the original + // induction variables map. An original update instruction will be dead if + // all its users except the induction variable are dead. + for (auto &Induction : *Legal->getInductionVars()) { + PHINode *Ind = Induction.first; + auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); + if (all_of(IndUpdate->users(), [&](User *U) -> bool { + return U == Ind || DeadInstructions.count(cast(U)); + })) + DeadInstructions.insert(IndUpdate); + } +} + void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { auto *SI = dyn_cast(Instr); bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); @@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(CM); + LoopVectorizationPlanner LVP(L, LI, &LVL, CM); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); @@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - Unroller.vectorize(); + LVP.executePlan(Unroller); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LB.vectorize(); + LVP.executePlan(LB); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index f112c555205c..64013d6d687d 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -40,7 +40,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include #include @@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef VL) { return Opcode; } -/// Get the intersection (logical and) of all of the potential IR flags -/// of each scalar operation (VL) that will be converted into a vector (I). -/// Flag set: NSW, NUW, exact, and all of fast-math. -static void propagateIRFlags(Value *I, ArrayRef VL) { - if (auto *VecOp = dyn_cast(I)) { - if (auto *I0 = dyn_cast(VL[0])) { - // VecOVp is initialized to the 0th scalar, so start counting from index - // '1'. - VecOp->copyIRFlags(I0); - for (int i = 1, e = VL.size(); i < e; ++i) { - if (auto *Scalar = dyn_cast(VL[i])) - VecOp->andIRFlags(Scalar); - } - } - } -} - /// \returns true if all of the values in \p VL have the same type or false /// otherwise. 
static bool allSameType(ArrayRef VL) { @@ -315,10 +300,10 @@ public: BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, - const DataLayout *DL) + const DataLayout *DL, OptimizationRemarkEmitter *ORE) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), - DL(DL), Builder(Se->getContext()) { + DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -331,7 +316,10 @@ public: else MaxVecRegSize = TTI->getRegisterBitWidth(true); - MinVecRegSize = MinVectorRegSizeOption; + if (MinVectorRegSizeOption.getNumOccurrences()) + MinVecRegSize = MinVectorRegSizeOption; + else + MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); } /// \brief Vectorize the tree that starts with the elements in \p VL. @@ -377,6 +365,8 @@ public: MinBWs.clear(); } + unsigned getTreeSize() const { return VectorizableTree.size(); } + /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -415,6 +405,8 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + OptimizationRemarkEmitter *getORE() { return ORE; } + private: struct TreeEntry; @@ -944,6 +936,8 @@ private: AssumptionCache *AC; DemandedBits *DB; const DataLayout *DL; + OptimizationRemarkEmitter *ORE; + unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). /// Instruction builder to construct the vectorized tree. @@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_PowerOf2; - int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, - Op2VK, Op1VP, Op2VP); + SmallVector Operands(VL0->operand_values()); + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, + Op2VP, Operands); int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, - Op1VP, Op2VP); + Op1VP, Op2VP, Operands); return VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() { // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. 
IsKnownPositive = all_of(TreeRoot, [&](Value *R) { - bool KnownZero = false; - bool KnownOne = false; - ComputeSignBit(R, KnownZero, KnownOne, *DL); - return KnownZero; + KnownBits Known = computeKnownBits(R, *DL); + return Known.isNonNegative(); }); // Determine the maximum number of bits required to store the scalar @@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass { auto *DT = &getAnalysis().getDomTree(); auto *AC = &getAnalysis().getAssumptionCache(F); auto *DB = &getAnalysis().getDemandedBits(); + auto *ORE = &getAnalysis().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); @@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A auto *DT = &AM.getResult(F); auto *AC = &AM.getResult(F); auto *DB = &AM.getResult(F); + auto *ORE = &AM.getResult(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); if (!Changed) return PreservedAnalyses::all(); @@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_) { + AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_) { SE = SE_; TTI = TTI_; TLI = TLI_; @@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. @@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + using namespace ore; + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast(Chain[i])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); + R.vectorizeTree(); // Move to the next bundle. @@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", + cast(Ops[0])) + << "SLP vectorized with cost " << ore::NV("Cost", Cost) + << " and with tree size " + << ore::NV("TreeSize", R.getTreeSize())); + Value *VectorizedRoot = R.vectorizeTree(); // Reconstruct the build vector by extracting the vectorized root. This @@ -4506,6 +4517,12 @@ public: DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". 
(HorRdx)\n"); + auto *I0 = cast(VL[0]); + V.getORE()->emit( + OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize())); // Vectorize a tree. DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc(); @@ -4513,7 +4530,7 @@ public: // Emit a reduction. Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); + emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, @@ -4583,33 +4600,31 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, - unsigned ReduxWidth, ArrayRef RedOps) { + unsigned ReduxWidth, ArrayRef RedOps, + const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + if (!IsPairwiseReduction) + return createSimpleTargetReduction( + Builder, TTI, ReductionOpcode, VectorizedValue, + TargetTransformInfo::ReductionFlags(), RedOps); + Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - if (IsPairwiseReduction) { - Value *LeftMask = + Value *LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true, Builder); - Value *RightMask = + Value *RightMask = createRdxShuffleMask(ReduxWidth, i, true, false, Builder); - Value *LeftShuf = Builder.CreateShuffleVector( + Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); - Value *RightShuf = Builder.CreateShuffleVector( + Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, - "bin.rdx"); - } else { - Value *UpperHalf = - createRdxShuffleMask(ReduxWidth, i, false, false, Builder); - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); - } + TmpVec = + Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx"); propagateIRFlags(TmpVec, RedOps); } @@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) namespace llvm { diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp index d2984697c8a9..6677063f944f 100644 --- a/lib/XRay/Trace.cpp +++ b/lib/XRay/Trace.cpp @@ -115,6 +115,7 @@ struct FDRState { uint16_t CPUId; uint16_t ThreadId; uint64_t BaseTSC; + /// Encode some of the state transitions for the FDR log reader as explicit /// checks. These are expectations for the next Record in the stream. enum class Token { @@ -123,8 +124,10 @@ struct FDRState { NEW_CPU_ID_RECORD, FUNCTION_SEQUENCE, SCAN_TO_END_OF_THREAD_BUF, + CUSTOM_EVENT_DATA, }; Token Expects; + // Each threads buffer may have trailing garbage to scan over, so we track our // progress. 
  uint64_t CurrentBufferSize;
@@ -143,6 +146,8 @@ Twine fdrStateToTwine(const FDRState::Token &state) {
     return "FUNCTION_SEQUENCE";
   case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
     return "SCAN_TO_END_OF_THREAD_BUF";
+  case FDRState::Token::CUSTOM_EVENT_DATA:
+    return "CUSTOM_EVENT_DATA";
   }
   return "UNKNOWN";
 }
@@ -212,13 +217,32 @@ Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
   return Error::success();
 }
 
+/// State transition when a CustomEventMarker is encountered.
+Error processCustomEventMarker(FDRState &State, uint8_t RecordFirstByte,
+                               DataExtractor &RecordExtractor,
+                               size_t &RecordSize) {
+  // We can encounter a CustomEventMarker anywhere in the log, so we can handle
+  // it regardless of the expectation. However, we do set the expectation to
+  // read a set number of fixed bytes, as described in the metadata.
+  uint32_t OffsetPtr = 1; // Read after the first byte.
+  uint32_t DataSize = RecordExtractor.getU32(&OffsetPtr);
+  uint64_t TSC = RecordExtractor.getU64(&OffsetPtr);
+
+  // FIXME: Actually represent the record through the API. For now we only skip
+  // through the data.
+  (void)TSC;
+  RecordSize = 16 + DataSize;
+  return Error::success();
+}
+
 /// Advances the state machine for reading the FDR record type by reading one
 /// Metadata Record and updating the State appropriately based on the kind of
 /// record encountered. The RecordKind is encoded in the first byte of the
 /// Record, which the caller should pass in because they have already read it
 /// to determine that this is a metadata record as opposed to a function record.
 Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
-                               DataExtractor &RecordExtractor) {
+                               DataExtractor &RecordExtractor,
+                               size_t &RecordSize) {
   // The remaining 7 bits are the RecordKind enum.
   uint8_t RecordKind = RecordFirstByte >> 1;
   switch (RecordKind) {
@@ -247,6 +271,11 @@ Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
             processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
       return E;
     break;
+  case 5: // CustomEventMarker
+    if (auto E = processCustomEventMarker(State, RecordFirstByte,
+                                          RecordExtractor, RecordSize))
+      return E;
+    break;
   default:
     // Widen the record type to uint16_t to prevent conversion to char.
     return make_error<StringError>(
@@ -400,7 +429,8 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
     bool isMetadataRecord = BitField & 0x01uL;
     if (isMetadataRecord) {
       RecordSize = 16;
-      if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor))
+      if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor,
+                                            RecordSize))
         return E;
       State.CurrentBufferConsumed += RecordSize;
     } else { // Process Function Record
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index 79d8fc7df99b..9102efbdcb46 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -22,7 +22,9 @@ endforeach(entry)
 if(${LLVM_BUILD_RUNTIME})
   # MSVC isn't quite working with libc++ yet, disable it until issues are
   # fixed.
-  if(NOT MSVC)
+  # FIXME: LLVM_FORCE_BUILD_RUNTIME is currently used by libc++ to force
+  # enable the in-tree build when targeting clang-cl.
+  if(NOT MSVC OR LLVM_FORCE_BUILD_RUNTIME)
     # Add the projects in reverse order of their dependencies so that the
     # dependent projects can see the target names of their dependencies.
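The custom-event handling added above implies a concrete record layout: one type/kind byte, a little-endian u32 payload size, a u64 TSC, padding out to the fixed 16-byte metadata record, then DataSize trailing payload bytes, hence RecordSize = 16 + DataSize. A stand-alone sketch of that size computation (plain memcpy reads, assuming a little-endian host; not the DataExtractor API):

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t customEventRecordSize(const uint8_t *Rec) {
  uint32_t DataSize;
  uint64_t TSC;
  std::memcpy(&DataSize, Rec + 1, sizeof(DataSize)); // after the first byte
  std::memcpy(&TSC, Rec + 5, sizeof(TSC));           // event timestamp
  (void)TSC; // the reader currently only skips over the payload
  return 16 + DataSize; // fixed header plus variable payload
}

int main() {
  // First byte 0x0b = (5 << 1) | 1: metadata bit set, RecordKind 5
  // (CustomEventMarker), matching the RecordFirstByte >> 1 decoding above.
  uint8_t Rec[16] = {0x0b, 8, 0, 0, 0};
  std::printf("%llu\n", (unsigned long long)customEventRecordSize(Rec)); // 24
  return 0;
}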
add_llvm_external_project(libunwind) diff --git a/test/Analysis/BasicAA/cs-cs-arm.ll b/test/Analysis/BasicAA/cs-cs-arm.ll new file mode 100644 index 000000000000..1580af9ea826 --- /dev/null +++ b/test/Analysis/BasicAA/cs-cs-arm.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; REQUIRES: arm + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" +target triple = "arm-apple-ios" + +declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind + +define <8 x i16> @test1(i8* %p, <8 x i16> %y) { +entry: + %q = getelementptr i8, i8* %p, i64 16 + %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) + %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + %c = add <8 x i16> %a, %b + ret <8 x i16> %c + +; CHECK-LABEL: Function: test1: + +; CHECK: NoAlias: i8* %p, i8* %q +; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +} diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll index 0f74dbd92bbd..870794c25165 100644 --- a/test/Analysis/BasicAA/cs-cs.ll +++ b/test/Analysis/BasicAA/cs-cs.ll @@ -2,41 +2,12 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" target triple = "arm-apple-ios" -declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind - declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind declare void @a_readonly_func(i8 *) noinline nounwind readonly declare 
void @a_writeonly_func(i8 *) noinline nounwind writeonly -define <8 x i16> @test1(i8* %p, <8 x i16> %y) { -entry: - %q = getelementptr i8, i8* %p, i64 16 - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - %c = add <8 x i16> %a, %b - ret <8 x i16> %c - -; CHECK-LABEL: Function: test1: - -; CHECK: NoAlias: i8* %p, i8* %q -; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -} - define void @test2(i8* %P, i8* %Q) nounwind ssp { tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) @@ -247,9 +218,9 @@ define void @test7(i8* %P) nounwind ssp { ; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @a_writeonly_func(i8* %P) } -declare void @an_inaccessiblememonly_func() nounwind inaccessiblememonly -declare void @an_inaccessibleorargmemonly_func(i8 *) nounwind inaccessiblemem_or_argmemonly -declare void @an_argmemonly_func(i8 *) nounwind argmemonly +declare void @an_inaccessiblememonly_func() nounwind inaccessiblememonly +declare void @an_inaccessibleorargmemonly_func(i8 *) nounwind inaccessiblemem_or_argmemonly +declare void @an_argmemonly_func(i8 *) nounwind argmemonly define void @test8(i8* %p) { entry: @@ -260,7 +231,7 @@ entry: call void @an_inaccessiblememonly_func() call void @an_inaccessibleorargmemonly_func(i8* %q) call void @an_argmemonly_func(i8* %q) - ret void + ret void ; CHECK-LABEL: Function: test8 ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() diff --git a/test/Analysis/BasicAA/intrinsics-arm.ll b/test/Analysis/BasicAA/intrinsics-arm.ll new file mode 100644 index 000000000000..e15ce1c65c64 --- /dev/null +++ b/test/Analysis/BasicAA/intrinsics-arm.ll @@ -0,0 +1,31 @@ +; RUN: opt -basicaa -gvn -S < 
%s | FileCheck %s +; REQUIRES: arm + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" + +; BasicAA should prove that these calls don't interfere, since we've +; specifically special cased exactly these two intrinsics in +; MemoryLocation::getForArgument. + +; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16 +; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]] +; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK-NEXT: %c = add <8 x i16> %a, %a +define <8 x i16> @test1(i8* %p, <8 x i16> %y) { +entry: + %q = getelementptr i8, i8* %p, i64 16 + %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) + %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + %c = add <8 x i16> %a, %b + ret <8 x i16> %c +} + +declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind + +; CHECK: attributes #0 = { argmemonly nounwind readonly } +; CHECK: attributes #1 = { argmemonly nounwind } +; CHECK: attributes [[ATTR]] = { nounwind } diff --git a/test/Analysis/BasicAA/intrinsics.ll b/test/Analysis/BasicAA/intrinsics.ll index 526a039ef7ac..68e59862bcc1 100644 --- a/test/Analysis/BasicAA/intrinsics.ll +++ b/test/Analysis/BasicAA/intrinsics.ll @@ -5,38 +5,22 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32- ; BasicAA should prove that these calls don't interfere, since they are ; IntrArgReadMem and have noalias pointers. 
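The no-interference claim these tests check has a rough C++ analogue: __restrict (a common compiler extension, like C's restrict) makes the same no-alias promise that the IR-level noalias arguments make here, allowing the second load to be forwarded from the first; that is exactly the %c = add %a, %a folding the CHECK lines look for. An illustrative analogue, not a translation of the test:

#include <cstdint>
#include <cstdio>

static int16_t forward(int16_t *__restrict p, int16_t *__restrict q,
                       int16_t y) {
  int16_t a = *p;
  *q = y;         // cannot touch *p, by the no-alias promise
  int16_t b = *p; // a compiler may reuse 'a' instead of reloading
  return a + b;
}

int main() {
  int16_t x = 3, z = 0;
  std::printf("%d\n", forward(&x, &z, 7)); // prints 6
  return 0;
}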
-; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { +; CHECK: define <8 x i16> @test0(<8 x i16>* noalias %p, <8 x i16>* noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]] -; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[ATTR:#[0-9]+]] +; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m) ; CHECK-NEXT: %c = add <8 x i16> %a, %a -define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { +define <8 x i16> @test0(<8 x i16>* noalias %p, <8 x i16>* noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { entry: - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m) + %b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind %c = add <8 x i16> %a, %b ret <8 x i16> %c } -; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) { -; CHECK-NEXT: entry: -; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16 -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR]] -; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK-NEXT: %c = add <8 x i16> %a, %a -define <8 x i16> @test1(i8* %p, <8 x i16> %y) { -entry: - %q = getelementptr i8, i8* %p, i64 16 - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - %c = add <8 x i16> %a, %b - ret <8 x i16> %c -} - -declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) nounwind readonly +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind ; CHECK: attributes #0 = { argmemonly nounwind readonly } ; CHECK: attributes #1 = { argmemonly nounwind } diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll index 84936b7761ca..7bee1bd57373 100644 --- a/test/Analysis/BranchProbabilityInfo/basic.ll +++ b/test/Analysis/BranchProbabilityInfo/basic.ll @@ -452,7 +452,7 @@ entry: i32 3, label %case_d i32 4, label %case_e ], !prof !8 ; CHECK: edge entry -> case_a probability is 0x00000800 / 0x80000000 = 0.00% -; CHECK: edge entry -> case_b probability is 0x07fffe01 / 0x80000000 = 6.25% +; CHECK: edge entry -> case_b probability is 0x07fffdff / 0x80000000 = 6.25% ; CHECK: edge entry -> case_c probability is 0x67fffdff / 0x80000000 = 81.25% [HOT edge] ; CHECK: edge entry -> case_d probability is 0x07fffdff / 0x80000000 = 6.25% ; CHECK: edge entry -> case_e probability is 0x07fffdff / 0x80000000 = 6.25% @@ -495,7 +495,7 @@ entry: i32 4, label %case_e ], !prof !9 ; CHECK: edge 
entry -> case_a probability is 0x00000400 / 0x80000000 = 0.00% ; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00% -; CHECK: edge entry -> case_c probability is 0x6aaaa800 / 0x80000000 = 83.33% [HOT edge] +; CHECK: edge entry -> case_c probability is 0x6aaaa7ff / 0x80000000 = 83.33% [HOT edge] ; CHECK: edge entry -> case_d probability is 0x0aaaa7ff / 0x80000000 = 8.33% ; CHECK: edge entry -> case_e probability is 0x0aaaa7ff / 0x80000000 = 8.33% @@ -535,7 +535,7 @@ entry: i32 4, label %case_e ], !prof !10 ; CHECK: edge entry -> case_a probability is 0x00000000 / 0x80000000 = 0.00% ; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00% -; CHECK: edge entry -> case_c probability is 0x6e08fa2e / 0x80000000 = 85.96% [HOT edge] +; CHECK: edge entry -> case_c probability is 0x6e08fa2d / 0x80000000 = 85.96% [HOT edge] ; CHECK: edge entry -> case_d probability is 0x08fb80e9 / 0x80000000 = 7.02% ; CHECK: edge entry -> case_e probability is 0x08fb80e9 / 0x80000000 = 7.02% diff --git a/test/Analysis/CostModel/AArch64/free-widening-casts.ll b/test/Analysis/CostModel/AArch64/free-widening-casts.ll new file mode 100644 index 000000000000..07f32d1d8ba2 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/free-widening-casts.ll @@ -0,0 +1,622 @@ +; RUN: opt < %s -mtriple=aarch64--linux-gnu -cost-model -analyze | FileCheck %s --check-prefix=COST +; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE + +; COST-LABEL: uaddl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +; CODE-LABEL: uaddl_8h +; CODE: uaddl v0.8h, v0.8b, v1.8b +define <8 x i16> @uaddl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: uaddl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +; CODE-LABEL: uaddl_4s +; CODE: uaddl v0.4s, v0.4h, v1.4h +define <4 x i32> @uaddl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: uaddl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64> +; CODE-LABEL: uaddl_2d +; CODE: uaddl v0.2d, v0.2s, v1.2s +define <2 x i64> @uaddl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = zext <2 x i32> %b to <2 x i64> + %tmp2 = add <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: uaddl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16> +; CODE-LABEL: uaddl2_8h +; CODE: uaddl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: uaddl v0.8h, v0.8b, v1.8b +define <16 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = zext <16 x i8> %b to <16 x i16> + %tmp2 = add <16 x i16> %tmp0, %tmp1 + ret <16 x i16> 
%tmp2 +} + +; COST-LABEL: uaddl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32> +; CODE-LABEL: uaddl2_4s +; CODE: uaddl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: uaddl v0.4s, v0.4h, v1.4h +define <8 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = zext <8 x i16> %b to <8 x i32> + %tmp2 = add <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: uaddl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64> +; CODE-LABEL: uaddl2_2d +; CODE: uaddl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: uaddl v0.2d, v0.2s, v1.2s +define <4 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = zext <4 x i32> %b to <4 x i64> + %tmp2 = add <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: saddl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16> +; CODE-LABEL: saddl_8h +; CODE: saddl v0.8h, v0.8b, v1.8b +define <8 x i16> @saddl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = sext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: saddl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32> +; CODE-LABEL: saddl_4s +; CODE: saddl v0.4s, v0.4h, v1.4h +define <4 x i32> @saddl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: saddl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64> +; CODE-LABEL: saddl_2d +; CODE: saddl v0.2d, v0.2s, v1.2s +define <2 x i64> @saddl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sext <2 x i32> %b to <2 x i64> + %tmp2 = add <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: saddl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16> +; CODE-LABEL: saddl2_8h +; CODE: saddl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: saddl v0.8h, v0.8b, v1.8b +define <16 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sext <16 x i8> %b to <16 x i16> + %tmp2 = add <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: saddl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32> +; CODE-LABEL: saddl2_4s +; CODE: saddl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: saddl v0.4s, v0.4h, v1.4h 
+define <8 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sext <8 x i16> %b to <8 x i32> + %tmp2 = add <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: saddl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64> +; CODE-LABEL: saddl2_2d +; CODE: saddl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: saddl v0.2d, v0.2s, v1.2s +define <4 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sext <4 x i32> %b to <4 x i64> + %tmp2 = add <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: usubl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +; CODE-LABEL: usubl_8h +; CODE: usubl v0.8h, v0.8b, v1.8b +define <8 x i16> @usubl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = sub <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: usubl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +; CODE-LABEL: usubl_4s +; CODE: usubl v0.4s, v0.4h, v1.4h +define <4 x i32> @usubl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = sub <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: usubl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64> +; CODE-LABEL: usubl_2d +; CODE: usubl v0.2d, v0.2s, v1.2s +define <2 x i64> @usubl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = zext <2 x i32> %b to <2 x i64> + %tmp2 = sub <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: usubl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16> +; CODE-LABEL: usubl2_8h +; CODE: usubl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: usubl v0.8h, v0.8b, v1.8b +define <16 x i16> @usubl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = zext <16 x i8> %b to <16 x i16> + %tmp2 = sub <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: usubl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32> +; CODE-LABEL: usubl2_4s +; CODE: usubl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: usubl v0.4s, v0.4h, v1.4h +define <8 x i32> @usubl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = zext <8 x i16> %b to <8 x i32> + %tmp2 = sub <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: usubl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64> +; CODE-LABEL: usubl2_2d +; CODE: usubl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: usubl v0.2d, v0.2s, v1.2s +define <4 x i64> @usubl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = zext <4 x i32> %b to <4 x i64> + %tmp2 = sub <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: ssubl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16> +; CODE-LABEL: ssubl_8h +; CODE: ssubl v0.8h, v0.8b, v1.8b +define <8 x i16> @ssubl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = sext <8 x i8> %b to <8 x i16> + %tmp2 = sub <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: ssubl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32> +; CODE-LABEL: ssubl_4s +; CODE: ssubl v0.4s, v0.4h, v1.4h +define <4 x i32> @ssubl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sext <4 x i16> %b to <4 x i32> + %tmp2 = sub <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: ssubl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64> +; CODE-LABEL: ssubl_2d +; CODE: ssubl v0.2d, v0.2s, v1.2s +define <2 x i64> @ssubl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sext <2 x i32> %b to <2 x i64> + %tmp2 = sub <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: ssubl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16> +; CODE-LABEL: ssubl2_8h +; CODE: ssubl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: ssubl v0.8h, v0.8b, v1.8b +define <16 x i16> @ssubl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sext <16 x i8> %b to <16 x i16> + %tmp2 = sub <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: ssubl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32> +; CODE-LABEL: ssubl2_4s +; CODE: ssubl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: ssubl v0.4s, v0.4h, v1.4h +define <8 x i32> @ssubl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sext <8 x i16> %b to <8 x i32> + %tmp2 = sub <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: ssubl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64> +; CODE-LABEL: ssubl2_2d +; CODE: ssubl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: ssubl v0.2d, v0.2s, v1.2s +define <4 x i64> @ssubl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sext <4 x i32> %b to <4 x i64> + %tmp2 = sub <4 x i64> %tmp0, 
%tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: uaddw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; CODE-LABEL: uaddw_8h +; CODE: uaddw v0.8h, v1.8h, v0.8b +define <8 x i16> @uaddw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = add <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: uaddw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; CODE-LABEL: uaddw_4s +; CODE: uaddw v0.4s, v1.4s, v0.4h +define <4 x i32> @uaddw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = add <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: uaddw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; CODE-LABEL: uaddw_2d +; CODE: uaddw v0.2d, v1.2d, v0.2s +define <2 x i64> @uaddw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = add <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: uaddw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; CODE-LABEL: uaddw2_8h +; CODE: uaddw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: uaddw v0.8h, v1.8h, v0.8b +define <16 x i16> @uaddw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = add <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: uaddw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; CODE-LABEL: uaddw2_4s +; CODE: uaddw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: uaddw v0.4s, v1.4s, v0.4h +define <8 x i32> @uaddw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = add <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: uaddw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; CODE-LABEL: uaddw2_2d +; CODE: uaddw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: uaddw v0.2d, v1.2d, v0.2s +define <4 x i64> @uaddw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = add <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: saddw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; CODE-LABEL: saddw_8h +; CODE: saddw v0.8h, v1.8h, v0.8b +define <8 x i16> @saddw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = add <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: saddw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; CODE-LABEL: saddw_4s +; CODE: saddw v0.4s, v1.4s, v0.4h +define <4 x i32> @saddw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = add <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: saddw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; CODE-LABEL: saddw_2d +; CODE: saddw v0.2d, v1.2d, v0.2s +define <2 x i64> @saddw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = add <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: saddw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; CODE-LABEL: saddw2_8h +; CODE: saddw2 v2.8h, v2.8h, v0.16b +; 
CODE-NEXT: saddw v0.8h, v1.8h, v0.8b +define <16 x i16> @saddw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = add <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: saddw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; CODE-LABEL: saddw2_4s +; CODE: saddw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: saddw v0.4s, v1.4s, v0.4h +define <8 x i32> @saddw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = add <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: saddw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; CODE-LABEL: saddw2_2d +; CODE: saddw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: saddw v0.2d, v1.2d, v0.2s +define <4 x i64> @saddw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = add <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: usubw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; CODE-LABEL: usubw_8h +; CODE: usubw v0.8h, v1.8h, v0.8b +define <8 x i16> @usubw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = sub <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: usubw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; CODE-LABEL: usubw_4s +; CODE: usubw v0.4s, v1.4s, v0.4h +define <4 x i32> @usubw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = sub <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: usubw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; CODE-LABEL: usubw_2d +; CODE: usubw v0.2d, v1.2d, v0.2s +define <2 x i64> @usubw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = sub <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: usubw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; CODE-LABEL: usubw2_8h +; CODE: usubw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: usubw v0.8h, v1.8h, v0.8b +define <16 x i16> @usubw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = sub <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: usubw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; CODE-LABEL: usubw2_4s +; CODE: usubw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: usubw v0.4s, v1.4s, v0.4h +define <8 x i32> @usubw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = sub <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: usubw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; CODE-LABEL: usubw2_2d +; CODE: usubw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: usubw v0.2d, v1.2d, v0.2s +define <4 x i64> @usubw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = sub <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: ssubw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; CODE-LABEL: ssubw_8h +; CODE: ssubw v0.8h, v1.8h, v0.8b +define <8 x i16> @ssubw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + 
%tmp1 = sub <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: ssubw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; CODE-LABEL: ssubw_4s +; CODE: ssubw v0.4s, v1.4s, v0.4h +define <4 x i32> @ssubw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sub <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: ssubw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; CODE-LABEL: ssubw_2d +; CODE: ssubw v0.2d, v1.2d, v0.2s +define <2 x i64> @ssubw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sub <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: ssubw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; CODE-LABEL: ssubw2_8h +; CODE: ssubw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: ssubw v0.8h, v1.8h, v0.8b +define <16 x i16> @ssubw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sub <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: ssubw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; CODE-LABEL: ssubw2_4s +; CODE: ssubw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: ssubw v0.4s, v1.4s, v0.4h +define <8 x i32> @ssubw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sub <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: ssubw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; CODE-LABEL: ssubw2_2d +; CODE: ssubw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: ssubw v0.2d, v1.2d, v0.2s +define <4 x i64> @ssubw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sub <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: neg_wrong_operand_order +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +define <8 x i16> @neg_wrong_operand_order(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = sub <8 x i16> %tmp0, %b + ret <8 x i16> %tmp1 +} + +; COST-LABEL: neg_non_widening_op +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +define <8 x i16> @neg_non_widening_op(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = udiv <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: neg_dissimilar_operand_kind_0 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +define <8 x i16> @neg_dissimilar_operand_kind_0(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: neg_dissimilar_operand_kind_1 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <4 x i8> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +define <4 x i32> @neg_dissimilar_operand_kind_1(<4 x i8> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i8> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + 
ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: neg_illegal_vector_type_0
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <16 x i4> %a to <16 x i8>
+define <16 x i8> @neg_illegal_vector_type_0(<16 x i4> %a, <16 x i8> %b) {
+ %tmp0 = zext <16 x i4> %a to <16 x i8>
+ %tmp1 = sub <16 x i8> %b, %tmp0
+ ret <16 x i8> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_1
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <1 x i16> %a to <1 x i32>
+define <1 x i32> @neg_illegal_vector_type_1(<1 x i16> %a, <1 x i32> %b) {
+ %tmp0 = zext <1 x i16> %a to <1 x i32>
+ %tmp1 = add <1 x i32> %b, %tmp0
+ ret <1 x i32> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_2
+; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i64>
+define <4 x i64> @neg_illegal_vector_type_2(<4 x i16> %a, <4 x i64> %b) {
+ %tmp0 = zext <4 x i16> %a to <4 x i64>
+ %tmp1 = add <4 x i64> %b, %tmp0
+ ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_3
+; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
+define <3 x i68> @neg_illegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) {
+ %tmp0 = zext <3 x i34> %a to <3 x i68>
+ %tmp1 = add <3 x i68> %b, %tmp0
+ ret <3 x i68> %tmp1
+}
diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll
index 1efbb5873acb..54c8b6c52365 100644
--- a/test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -1,7 +1,9 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; CHECK: 'extractelement_v2i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
+; GCN: 'extractelement_v2i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32>
 define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %elt = extractelement <2 x i32> %vec, i32 1
@@ -9,8 +11,8 @@ define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32
 ret void
 }
-; CHECK: 'extractelement_v2f32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
+; GCN: 'extractelement_v2f32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float>
 define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %elt = extractelement <2 x float> %vec, i32 1
@@ -18,8 +20,8 @@ define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x f
 ret void
 }
-; CHECK: 'extractelement_v3i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
+; GCN: 'extractelement_v3i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32>
 define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out,
<3 x i32 ret void } -; CHECK: 'extractelement_v4i32' -; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32> +; GCN: 'extractelement_v4i32' +; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32> define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) { %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr %elt = extractelement <4 x i32> %vec, i32 1 @@ -36,8 +38,8 @@ define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32 ret void } -; CHECK: 'extractelement_v8i32' -; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32> +; GCN: 'extractelement_v8i32' +; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32> define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) { %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr %elt = extractelement <8 x i32> %vec, i32 1 @@ -46,8 +48,8 @@ define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32 } ; FIXME: Should be non-0 -; CHECK: 'extractelement_v8i32_dynindex' -; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32> +; GCN: 'extractelement_v8i32_dynindex' +; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32> define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) { %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr %elt = extractelement <8 x i32> %vec, i32 %idx @@ -55,8 +57,8 @@ define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, ret void } -; CHECK: 'extractelement_v2i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64> +; GCN: 'extractelement_v2i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64> define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) { %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr %elt = extractelement <2 x i64> %vec, i64 1 @@ -64,8 +66,8 @@ define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64 ret void } -; CHECK: 'extractelement_v3i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64> +; GCN: 'extractelement_v3i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64> define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) { %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr %elt = extractelement <3 x i64> %vec, i64 1 @@ -73,8 +75,8 @@ define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64 ret void } -; CHECK: 'extractelement_v4i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64> +; GCN: 'extractelement_v4i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64> define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) { %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr %elt = extractelement <4 x i64> %vec, i64 1 @@ -82,8 +84,8 @@ define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64 ret void } -; CHECK: 'extractelement_v8i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64> +; GCN: 'extractelement_v8i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64> define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) { %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr %elt = extractelement <8 x i64> %vec, i64 1 @@ -91,8 +93,8 @@ define 
amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64 ret void } -; CHECK: 'extractelement_v4i8' -; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8> +; GCN: 'extractelement_v4i8' +; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8> define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) { %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr %elt = extractelement <4 x i8> %vec, i8 1 @@ -100,11 +102,31 @@ define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> a ret void } -; CHECK: 'extractelement_v2i16' -; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16> -define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { +; GCN: 'extractelement_0_v2i16': +; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0 +; VI: estimated cost of 0 for {{.*}} extractelement <2 x i16> +; GFX9: estimated cost of 0 for {{.*}} extractelement <2 x i16> +define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr + %elt = extractelement <2 x i16> %vec, i16 0 + store i16 %elt, i16 addrspace(1)* %out + ret void +} + +; GCN: 'extractelement_1_v2i16': +; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16> +define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr %elt = extractelement <2 x i16> %vec, i16 1 store i16 %elt, i16 addrspace(1)* %out ret void } + +; GCN: 'extractelement_var_v2i16' +; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16> +define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) { + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr + %elt = extractelement <2 x i16> %vec, i32 %idx + store i16 %elt, i16 addrspace(1)* %out + ret void +} diff --git a/test/Analysis/CostModel/AMDGPU/insertelement.ll b/test/Analysis/CostModel/AMDGPU/insertelement.ll index 6f296a3e7a34..67ab2607acd5 100644 --- a/test/Analysis/CostModel/AMDGPU/insertelement.ll +++ b/test/Analysis/CostModel/AMDGPU/insertelement.ll @@ -1,37 +1,50 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s -; CHECK: 'insertelement_v2i32' -; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32> +; GCN-LABEL: 'insertelement_v2i32' +; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i32> define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) { %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr - %insert = insertelement <2 x i32> %vec, i32 1, i32 123 + %insert = insertelement <2 x i32> %vec, i32 123, i32 1 store <2 x i32> %insert, <2 x i32> addrspace(1)* %out ret void } -; CHECK: 'insertelement_v2i64' -; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64> +; GCN-LABEL: 'insertelement_v2i64' +; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i64> define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x 
i64> addrspace(1)* %vaddr) {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
- %insert = insertelement <2 x i64> %vec, i64 1, i64 123
+ %insert = insertelement <2 x i64> %vec, i64 123, i64 1
 store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
 ret void
 }
-; CHECK: 'insertelement_v2i16'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+; GCN-LABEL: 'insertelement_0_v2i16'
+; CI: estimated cost of 1 for {{.*}} insertelement <2 x i16>
+; VI: estimated cost of 0 for {{.*}} insertelement <2 x i16>
+; GFX9: estimated cost of 0 for {{.*}} insertelement <2 x i16>
+define amdgpu_kernel void @insertelement_0_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
- %insert = insertelement <2 x i16> %vec, i16 1, i16 123
+ %insert = insertelement <2 x i16> %vec, i16 123, i16 0
 store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
 ret void
 }
-; CHECK: 'insertelement_v2i8'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
-define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+; GCN-LABEL: 'insertelement_1_v2i16'
+; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i16>
+define amdgpu_kernel void @insertelement_1_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %insert = insertelement <2 x i16> %vec, i16 123, i16 1
+ store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: 'insertelement_1_v2i8'
+; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i8>
+define amdgpu_kernel void @insertelement_1_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
- %insert = insertelement <2 x i8> %vec, i8 1, i8 123
+ %insert = insertelement <2 x i8> %vec, i8 123, i8 1
 store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
 ret void
 }
diff --git a/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/test/Analysis/CostModel/AMDGPU/shufflevector.ll
new file mode 100644
index 000000000000..cc756c82fed3
--- /dev/null
+++ b/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -0,0 +1,43 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
+define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
+define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+define amdgpu_kernel void
@shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr0, <2 x i16> addrspace(1)* %vaddr1) {
+ %vec0 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr0
+ %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr1
+ %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll
index 0ac06ff75ebe..dabaaef3596a 100644
--- a/test/Analysis/CostModel/X86/div.ll
+++ b/test/Analysis/CostModel/X86/div.ll
@@ -139,14 +139,14 @@ define i32 @sdiv_uniformconst() {
 ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
 %V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
 %V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -157,12 +157,12 @@ define i32 @sdiv_uniformconst() {
 ; AVX: cost of 6 {{.*}} %V8i16 = sdiv
 %V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
 %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
@@ -203,12 +203,12 @@ define i32 @udiv_uniformconst() {
 ; AVX: cost of 15 {{.*}} %V4i32 = udiv
 %V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
 ; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = udiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = udiv
 %V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = udiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
 %V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -219,12 +219,12 @@ define i32 @udiv_uniformconst() {
 ; AVX: cost of 6 {{.*}} %V8i16 = udiv
 %V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 14
{{.*}} %V16i16 = udiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = udiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = udiv
 %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = udiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
@@ -269,14 +269,14 @@ define i32 @sdiv_uniformconstpow2() {
 ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
 %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
 %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -287,12 +287,12 @@ define i32 @sdiv_uniformconstpow2() {
 ; AVX: cost of 6 {{.*}} %V8i16 = sdiv
 %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
 %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
@@ -333,12 +333,12 @@ define i32 @udiv_uniformconstpow2() {
 ; AVX: cost of 15 {{.*}} %V4i32 = udiv
 %V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
 ; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = udiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = udiv
 %V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = udiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
 %V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -349,12 +349,12 @@ define i32 @udiv_uniformconstpow2() {
 ; AVX: cost of 6 {{.*}} %V8i16 = udiv
 %V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = udiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = udiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = udiv
 %V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = udiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index a23b13fb2e25..eabc2330ddc6 100644
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -33,10 +33,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64':
 ; SSE2: Found an estimated cost of 24 for instruction: %shift
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+;
AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -45,10 +45,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': ; SSE2: Found an estimated cost of 48 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <8 x i64> %a, %b ret <8 x i64> %shift } @@ -70,10 +70,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, %b ret <8 x i32> %shift @@ -83,10 +83,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i32> %a, %b ret <16 x i32> %shift @@ -109,11 +109,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -122,11 +122,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': ; SSE2: Found an estimated cost of 
128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift } @@ -147,11 +147,11 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 50 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; AVX512F: Found an estimated cost of 24 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -160,11 +160,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': ; SSE2: Found an estimated cost of 216 for instruction: %shift ; SSE41: Found an estimated cost of 96 for instruction: %shift -; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 100 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift } @@ -191,11 +191,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift -; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift +; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift -; XOPAVX2: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %insert = insertelement <4 x i64> undef, i64 %b, i32 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat @@ -206,11 +205,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift -; AVX2: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift +; AVX2: Found 
an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift -; XOPAVX2: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %insert = insertelement <8 x i64> undef, i64 %b, i32 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i64> %a, %splat @@ -235,10 +233,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <8 x i32> undef, i32 %b, i32 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer @@ -250,10 +248,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <16 x i32> undef, i32 %b, i32 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer @@ -279,10 +277,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <16 x i16> undef, i16 %b, i32 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -294,11 +292,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost 
of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -324,10 +322,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 50 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; AVX512: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat @@ -338,11 +336,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': ; SSE2: Found an estimated cost of 216 for instruction: %shift ; SSE41: Found an estimated cost of 96 for instruction: %shift -; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 100 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer %shift = ashr <64 x i8> %a, %splat @@ -369,10 +367,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64': ; SSE2: Found an estimated cost of 24 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -381,10 +379,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': ; SSE2: Found an estimated cost of 48 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <8 x i64> %a, ret <8 x i64> %shift } @@ -406,10 +404,10 @@ 
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift @@ -419,10 +417,10 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i32> %a, ret <16 x i32> %shift @@ -445,11 +443,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } @@ -458,11 +456,11 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <32 x i16> %a, ret <32 x i16> %shift } @@ -483,10 +481,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 50 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; AVX512: Found an estimated cost of 24 for instruction: 
%shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } @@ -495,11 +493,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': ; SSE2: Found an estimated cost of 216 for instruction: %shift ; SSE41: Found an estimated cost of 96 for instruction: %shift -; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 100 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <64 x i8> %a, ret <64 x i8> %shift } @@ -524,10 +522,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift +; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -536,10 +535,11 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift +; XOPAVX2: Found an estimated cost of 8 for instruction: %shift %shift = ashr <8 x i64> %a, ret <8 x i64> %shift } @@ -560,10 +560,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift @@ -573,10 +573,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated 
cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i32> %a, ret <16 x i32> %shift @@ -598,10 +598,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -611,11 +611,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -628,7 +628,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 4 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, ret <16 x i8> %shift } @@ -637,10 +637,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 4 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -650,11 +650,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift 
-; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512F: Found an estimated cost of 8 for instruction: %shift ; AVX512BW: Found an estimated cost of 4 for instruction: %shift -; XOPAVX: Found an estimated cost of 16 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 8 for instruction: %shift %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 546b2bb50f26..6e890369d677 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -34,10 +34,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, %b ret <4 x i64> %shift @@ -47,10 +47,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i64> %a, %b ret <8 x i64> %shift @@ -73,10 +73,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, %b ret <8 x i32> %shift @@ -86,10 +86,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 2 for 
instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i32> %a, %b ret <16 x i32> %shift @@ -112,11 +112,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -125,11 +125,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift } @@ -150,10 +150,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -162,11 +162,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift } @@ -193,10 +193,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: 
%shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <4 x i64> undef, i64 %b, i32 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer @@ -208,10 +208,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <8 x i64> undef, i64 %b, i32 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer @@ -237,10 +237,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <8 x i32> undef, i32 %b, i32 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer @@ -252,10 +252,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <16 x i32> undef, i32 %b, i32 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer @@ -281,10 +281,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: 
%shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <16 x i16> undef, i16 %b, i32 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -296,11 +296,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -326,10 +326,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat @@ -340,11 +340,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer %shift = lshr <64 x i8> %a, %splat @@ -372,10 +372,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for 
instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift @@ -385,10 +385,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i64> %a, ret <8 x i64> %shift @@ -411,10 +411,10 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift @@ -424,10 +424,10 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i32> %a, ret <16 x i32> %shift @@ -450,11 +450,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } @@ -463,11 +463,11 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for 
function 'constant_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <32 x i16> %a, ret <32 x i16> %shift } @@ -488,10 +488,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } @@ -500,11 +500,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <64 x i8> %a, ret <64 x i8> %shift } @@ -529,10 +529,10 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift @@ -542,10 +542,10 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated 
cost of 2 for instruction: %shift %shift = lshr <8 x i64> %a, ret <8 x i64> %shift @@ -567,10 +567,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift @@ -580,10 +580,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i32> %a, ret <16 x i32> %shift @@ -605,10 +605,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -618,11 +618,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -644,10 +644,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for 
instruction: %shift +; AVX: Found an estimated cost of 6 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 2 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -657,11 +657,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512F: Found an estimated cost of 4 for instruction: %shift ; AVX512BW: Found an estimated cost of 2 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 90356f5ce8be..5e604bb7983e 100644 --- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -35,10 +35,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift @@ -48,10 +48,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <8 x i64> %a, %b ret <8 x i64> %shift @@ -74,10 +74,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 20 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for 
instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, %b ret <8 x i32> %shift @@ -87,10 +87,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': ; SSE2: Found an estimated cost of 40 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i32> %a, %b ret <16 x i32> %shift @@ -113,11 +113,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } @@ -126,11 +126,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift } @@ -151,10 +151,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } @@ -163,11 +163,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; 
AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift } @@ -194,10 +194,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <4 x i64> undef, i64 %b, i32 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer @@ -209,10 +209,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <8 x i64> undef, i64 %b, i32 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer @@ -238,10 +238,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <8 x i32> undef, i32 %b, i32 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer @@ -253,10 +253,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 
for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <16 x i32> undef, i32 %b, i32 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer @@ -282,10 +282,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <16 x i16> undef, i16 %b, i32 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -297,11 +297,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -327,10 +327,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat @@ -341,11 +341,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for 
instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer %shift = shl <64 x i8> %a, %splat @@ -373,10 +373,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift @@ -386,10 +386,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <8 x i64> %a, ret <8 x i64> %shift @@ -415,7 +415,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift @@ -428,7 +428,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i32> %a, ret <16 x i32> %shift @@ -453,7 +453,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -467,7 +467,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: 
Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -489,10 +489,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <32 x i8> %a, ret <32 x i8> %shift } @@ -501,11 +501,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %shift = shl <64 x i8> %a, ret <64 x i8> %shift } @@ -531,10 +531,10 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift @@ -544,10 +544,10 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <8 x i64> %a, ret <8 x i64> %shift @@ -570,10 +570,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift 
-; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift @@ -583,10 +583,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i32> %a, ret <16 x i32> %shift @@ -608,10 +608,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -621,11 +621,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -638,7 +638,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 2 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, ret <16 x i8> %shift } @@ -647,7 +647,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for 
instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 6 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 2 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift @@ -660,7 +660,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512F: Found an estimated cost of 4 for instruction: %shift ; AVX512BW: Found an estimated cost of 2 for instruction: %shift @@ -761,7 +761,7 @@ define <16 x i16> @test6(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shl ; AVX: Found an estimated cost of 4 for instruction: %shl ; AVX2: Found an estimated cost of 1 for instruction: %shl -; XOPAVX: Found an estimated cost of 2 for instruction: %shl +; XOPAVX: Found an estimated cost of 4 for instruction: %shl ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl @@ -778,7 +778,7 @@ define <8 x i32> @test7(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shl ; AVX: Found an estimated cost of 4 for instruction: %shl ; AVX2: Found an estimated cost of 1 for instruction: %shl -; XOPAVX: Found an estimated cost of 2 for instruction: %shl +; XOPAVX: Found an estimated cost of 4 for instruction: %shl ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl @@ -794,9 +794,9 @@ define <4 x i64> @test8(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'test8': ; SSE2: Found an estimated cost of 8 for instruction: %shl ; SSE41: Found an estimated cost of 8 for instruction: %shl -; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX: Found an estimated cost of 10 for instruction: %shl ; AVX2: Found an estimated cost of 1 for instruction: %shl -; XOPAVX: Found an estimated cost of 2 for instruction: %shl +; XOPAVX: Found an estimated cost of 4 for instruction: %shl ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl @@ -811,7 +811,7 @@ define <32 x i16> @test9(<32 x i16> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shl ; AVX: Found an estimated cost of 8 for instruction: %shl ; AVX2: Found an estimated cost of 2 for instruction: %shl -; XOPAVX: Found an estimated cost of 4 for instruction: %shl +; XOPAVX: Found an estimated cost of 8 for instruction: %shl ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl @@ -826,7 +826,7 @@ define <16 x i32> @test10(<16 x i32> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shl ; AVX: Found an estimated cost of 8 for instruction: %shl ; AVX2: Found an estimated cost of 2 for instruction: %shl -; XOPAVX: Found an estimated cost of 4 for instruction: %shl +; XOPAVX: Found an estimated cost of 8 for instruction: %shl ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl @@ -842,7 +842,7 @@ define <8 x i64> @test11(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'test11': ; SSE2: Found an estimated cost of 16 for instruction: %shl ; SSE41: Found an estimated cost of 16 for instruction: %shl -; AVX: Found an estimated cost of 16 for instruction: %shl +; AVX: Found an estimated cost of 20 for 
instruction: %shl ; AVX2: Found an estimated cost of 2 for instruction: %shl -; XOPAVX: Found an estimated cost of 4 for instruction: %shl +; XOPAVX: Found an estimated cost of 8 for instruction: %shl ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl diff --git a/test/Analysis/ScalarEvolution/different-loops-recs.ll b/test/Analysis/ScalarEvolution/different-loops-recs.ll new file mode 100644 index 000000000000..ad3d1e0bd110 --- /dev/null +++ b/test/Analysis/ScalarEvolution/different-loops-recs.ll @@ -0,0 +1,454 @@ +; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s + +; This test set ensures that we can correctly operate with recurrencies from +; different loops. + +; Check that we can evaluate a sum of phis from two different loops in any +; order. + +define void @test_00() { + +; CHECK-LABEL: Classifying expressions for: @test_00 +; CHECK: %sum1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {14,+,3}<%loop1> +; CHECK: %sum2 = add i32 %sum1, %phi3 +; CHECK-NEXT: --> {20,+,6}<%loop1> +; CHECK: %sum3 = add i32 %phi4, %phi5 +; CHECK-NEXT: --> {116,+,3}<%loop2> +; CHECK: %sum4 = add i32 %sum3, %phi6 +; CHECK-NEXT: --> {159,+,6}<%loop2> +; CHECK: %s1 = add i32 %phi1, %phi4 +; CHECK-NEXT: --> {{{{}}73,+,1}<%loop1>,+,1}<%loop2> +; CHECK: %s2 = add i32 %phi5, %phi2 +; CHECK-NEXT: --> {{{{}}57,+,2}<%loop1>,+,2}<%loop2> +; CHECK: %s3 = add i32 %sum1, %sum3 +; CHECK-NEXT: --> {{{{}}130,+,3}<%loop1>,+,3}<%loop2> +; CHECK: %s4 = add i32 %sum4, %sum2 +; CHECK-NEXT: --> {{{{}}179,+,6}<%loop1>,+,6}<%loop2> +; CHECK: %s5 = add i32 %phi3, %sum3 +; CHECK-NEXT: --> {{{{}}122,+,3}<%loop1>,+,3}<%loop2> +; CHECK: %s6 = add i32 %sum2, %phi6 +; CHECK-NEXT: --> {{{{}}63,+,6}<%loop1>,+,3}<%loop2> + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1 ] + %phi2 = phi i32 [ 4, %entry ], [ %phi2.inc, %loop1 ] + %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %phi2.inc = add i32 %phi2, 2 + %phi3.inc = add i32 %phi3, 3 + %sum1 = add i32 %phi1, %phi2 + %sum2 = add i32 %sum1, %phi3 + %cond1 = icmp ult i32 %sum2, 1000 + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ] + %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ] + %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ] + %phi4.inc = add i32 %phi4, 1 + %phi5.inc = add i32 %phi5, 2 + %phi6.inc = add i32 %phi6, 3 + %sum3 = add i32 %phi4, %phi5 + %sum4 = add i32 %sum3, %phi6 + %cond2 = icmp ult i32 %sum4, 1000 + br i1 %cond2, label %loop2, label %exit + +exit: + %s1 = add i32 %phi1, %phi4 + %s2 = add i32 %phi5, %phi2 + %s3 = add i32 %sum1, %sum3 + %s4 = add i32 %sum4, %sum2 + %s5 = add i32 %phi3, %sum3 + %s6 = add i32 %sum2, %phi6 + ret void +} + +; Check that we can evaluate a sum of phis+invariants from two different loops +; in any order. 
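+; (For reference, the addrec arithmetic behind the checks above: two affine
+; recurrences of the same loop add component-wise, e.g. in @test_00
+;   {10,+,1}<%loop1> + {4,+,2}<%loop1> = {14,+,3}<%loop1>,
+; while recurrences of different loops nest, the inner chrec becoming the
+; start of the outer one, e.g. %s1 above is
+;   {10,+,1}<%loop1> + {63,+,1}<%loop2> = {{73,+,1}<%loop1>,+,1}<%loop2>.)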
+ +define void @test_01(i32 %a, i32 %b) { + +; CHECK-LABEL: Classifying expressions for: @test_01 +; CHECK: %sum1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {(%a + %b),+,3}<%loop1> +; CHECK: %sum2 = add i32 %sum1, %phi3 +; CHECK-NEXT: --> {(6 + %a + %b),+,6}<%loop1> +; CHECK: %is1 = add i32 %sum2, %a +; CHECK-NEXT: --> {(6 + (2 * %a) + %b),+,6}<%loop1> +; CHECK: %sum3 = add i32 %phi4, %phi5 +; CHECK-NEXT: --> {116,+,3}<%loop2> +; CHECK: %sum4 = add i32 %sum3, %phi6 +; CHECK-NEXT: --> {159,+,6}<%loop2> +; CHECK: %is2 = add i32 %sum4, %b +; CHECK-NEXT: --> {(159 + %b),+,6}<%loop2> +; CHECK: %ec2 = add i32 %is1, %is2 +; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> +; CHECK: %s1 = add i32 %phi1, %is1 +; CHECK-NEXT: --> {(6 + (3 * %a) + %b),+,7}<%loop1> +; CHECK: %s2 = add i32 %is2, %phi4 +; CHECK-NEXT: --> {(222 + %b),+,7}<%loop2> +; CHECK: %s3 = add i32 %is1, %phi5 +; CHECK-NEXT: --> {{{{}}(59 + (2 * %a) + %b),+,6}<%loop1>,+,2}<%loop2> +; CHECK: %s4 = add i32 %phi2, %is2 +; CHECK-NEXT: --> {{{{}}(159 + (2 * %b)),+,2}<%loop1>,+,6}<%loop2> +; CHECK: %s5 = add i32 %is1, %is2 +; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> +; CHECK: %s6 = add i32 %is2, %is1 +; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ] + %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ] + %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %phi2.inc = add i32 %phi2, 2 + %phi3.inc = add i32 %phi3, 3 + %sum1 = add i32 %phi1, %phi2 + %sum2 = add i32 %sum1, %phi3 + %is1 = add i32 %sum2, %a + %cond1 = icmp ult i32 %is1, 1000 + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ] + %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ] + %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ] + %phi4.inc = add i32 %phi4, 1 + %phi5.inc = add i32 %phi5, 2 + %phi6.inc = add i32 %phi6, 3 + %sum3 = add i32 %phi4, %phi5 + %sum4 = add i32 %sum3, %phi6 + %is2 = add i32 %sum4, %b + %ec2 = add i32 %is1, %is2 + %cond2 = icmp ult i32 %ec2, 1000 + br i1 %cond2, label %loop2, label %exit + +exit: + %s1 = add i32 %phi1, %is1 + %s2 = add i32 %is2, %phi4 + %s3 = add i32 %is1, %phi5 + %s4 = add i32 %phi2, %is2 + %s5 = add i32 %is1, %is2 + %s6 = add i32 %is2, %is1 + ret void +} + +; Check that we can correctly evaluate a sum of phis+variants from two different +; loops in any order. 
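+; (In @test_01 the loop invariants simply fold into the chrec coefficients:
+; %sum1 is {%a,+,1}<%loop1> + {%b,+,2}<%loop1> = {(%a + %b),+,3}<%loop1>, and
+; adding the invariant %a once more only bumps the start, giving
+; %is1 = {(6 + (2 * %a) + %b),+,6}<%loop1>.)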
+ +define void @test_02(i32 %a, i32 %b, i32* %p) { + +; CHECK-LABEL: Classifying expressions for: @test_02 +; CHECK: %sum1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {(%a + %b),+,3}<%loop1> +; CHECK: %sum2 = add i32 %sum1, %phi3 +; CHECK-NEXT: --> {(6 + %a + %b),+,6}<%loop1> +; CHECK: %is1 = add i32 %sum2, %v1 +; CHECK-NEXT: --> ({(6 + %a + %b),+,6}<%loop1> + %v1) +; CHECK: %sum3 = add i32 %phi4, %phi5 +; CHECK-NEXT: --> {(%a + %b),+,3}<%loop2> +; CHECK: %sum4 = add i32 %sum3, %phi6 +; CHECK-NEXT: --> {(43 + %a + %b),+,6}<%loop2> +; CHECK: %is2 = add i32 %sum4, %v2 +; CHECK-NEXT: --> ({(43 + %a + %b),+,6}<%loop2> + %v2) +; CHECK: %is3 = add i32 %v1, %sum2 +; CHECK-NEXT: --> ({(6 + %a + %b),+,6}<%loop1> + %v1) +; CHECK: %ec2 = add i32 %is1, %is3 +; CHECK-NEXT: --> (2 * ({(6 + %a + %b),+,6}<%loop1> + %v1)) +; CHECK: %s1 = add i32 %phi1, %is1 +; CHECK-NEXT: --> ({(6 + (2 * %a) + %b),+,7}<%loop1> + %v1) +; CHECK: %s2 = add i32 %is2, %phi4 +; CHECK-NEXT: --> ({(43 + (2 * %a) + %b),+,7}<%loop2> + %v2) +; CHECK: %s3 = add i32 %is1, %phi5 +; CHECK-NEXT: --> {({(6 + (2 * %b) + %a),+,6}<%loop1> + %v1),+,2}<%loop2> +; CHECK: %s4 = add i32 %phi2, %is2 +; CHECK-NEXT: --> ({{{{}}(43 + (2 * %b) + %a),+,2}<%loop1>,+,6}<%loop2> + %v2) +; CHECK: %s5 = add i32 %is1, %is2 +; CHECK-NEXT: --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2) +; CHECK: %s6 = add i32 %is2, %is1 +; CHECK-NEXT: --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2) + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ] + %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ] + %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %phi2.inc = add i32 %phi2, 2 + %phi3.inc = add i32 %phi3, 3 + %v1 = load i32, i32* %p + %sum1 = add i32 %phi1, %phi2 + %sum2 = add i32 %sum1, %phi3 + %is1 = add i32 %sum2, %v1 + %cond1 = icmp ult i32 %is1, 1000 + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi4 = phi i32 [ %a, %loop1 ], [ %phi4.inc, %loop2 ] + %phi5 = phi i32 [ %b, %loop1 ], [ %phi5.inc, %loop2 ] + %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ] + %phi4.inc = add i32 %phi4, 1 + %phi5.inc = add i32 %phi5, 2 + %phi6.inc = add i32 %phi6, 3 + %v2 = load i32, i32* %p + %sum3 = add i32 %phi4, %phi5 + %sum4 = add i32 %sum3, %phi6 + %is2 = add i32 %sum4, %v2 + %is3 = add i32 %v1, %sum2 + %ec2 = add i32 %is1, %is3 + %cond2 = icmp ult i32 %ec2, 1000 + br i1 %cond2, label %loop2, label %exit + +exit: + %s1 = add i32 %phi1, %is1 + %s2 = add i32 %is2, %phi4 + %s3 = add i32 %is1, %phi5 + %s4 = add i32 %phi2, %is2 + %s5 = add i32 %is1, %is2 + %s6 = add i32 %is2, %is1 + ret void +} + +; Mix of previous use cases that demonstrates %s3 can be incorrectly treated as +; a recurrence of loop1 because of operands order if we pick recurrencies in an +; incorrect order. 
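+; (Unlike the invariants in @test_01, the loaded values %v1 and %v2 in
+; @test_02 vary from iteration to iteration, so they cannot be folded into a
+; chrec and stay as symbolic addends outside it, e.g. %is1 is
+; ({(6 + %a + %b),+,6}<%loop1> + %v1). And since %is1 and %is3 are the same
+; expression, their sum %ec2 folds to (2 * (...)).)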
+ +define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) { + +; CHECK-LABEL: Classifying expressions for: @test_03 +; CHECK: %v1 = load i32, i32* %p +; CHECK-NEXT: --> %v1 +; CHECK: %s1 = add i32 %phi1, %v1 +; CHECK-NEXT: --> {(%a + %v1),+,1}<%loop1> +; CHECK: %s2 = add i32 %s1, %b +; CHECK-NEXT: --> {(%a + %b + %v1),+,1}<%loop1> +; CHECK: %s3 = add i32 %s2, %phi2 +; CHECK-NEXT: --> ({{{{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1) + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %cond1 = icmp ult i32 %phi1, %c + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi2 = phi i32 [ %a, %loop1 ], [ %phi2.inc, %loop2 ] + %phi2.inc = add i32 %phi2, 2 + %v1 = load i32, i32* %p + %s1 = add i32 %phi1, %v1 + %s2 = add i32 %s1, %b + %s3 = add i32 %s2, %phi2 + %cond2 = icmp ult i32 %s3, %c + br i1 %cond2, label %loop2, label %exit + +exit: + + ret void +} + +; Another mix of previous use cases that demonstrates that incorrect picking of +; a loop for a recurrence may cause a crash of SCEV analysis. +define void @test_04() { + +; CHECK-LABEL: Classifying expressions for: @test_04 +; CHECK: %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ] +; CHECK-NEXT: --> {2,+,1}<%loop1> +; CHECK: %tmp2 = trunc i64 %tmp to i32 +; CHECK-NEXT: --> {2,+,1}<%loop1> +; CHECK: %tmp4 = add nuw nsw i64 %tmp, 1 +; CHECK-NEXT: --> {3,+,1}<%loop1> +; CHECK: %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ] +; CHECK-NEXT: --> {2,+,1}<%loop2> +; CHECK: %tmp10 = sub i64 %tmp9, %tmp7 +; CHECK-NEXT: --> ((sext i8 %tmp8 to i64) + {-2,+,-1}<%loop2>) +; CHECK: %tmp11 = add i64 %tmp10, undef +; CHECK-NEXT: --> ((sext i8 %tmp8 to i64) + {(-2 + undef),+,-1}<%loop2>) +; CHECK: %tmp13 = trunc i64 %tmp11 to i32 +; CHECK-NEXT: --> ((sext i8 %tmp8 to i32) + {(trunc i64 (-2 + undef) to i32),+,-1}<%loop2>) +; CHECK: %tmp14 = sub i32 %tmp13, %tmp2 +; CHECK-NEXT: --> ((sext i8 %tmp8 to i32) + {{{{}}(-2 + (trunc i64 (-2 + undef) to i32)),+,-1}<%loop1>,+,-1}<%loop2>) +; CHECK: %tmp15 = add nuw nsw i64 %tmp7, 1 +; CHECK-NEXT: --> {3,+,1}<%loop2> + +bb: + br label %loop1 + +loop1: + %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ] + %tmp2 = trunc i64 %tmp to i32 + br i1 undef, label %loop2, label %bb3 + +bb3: + %tmp4 = add nuw nsw i64 %tmp, 1 + br label %loop1 + +bb5: + ret void + +loop2: + %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ] + %tmp8 = load i8, i8 addrspace(1)* undef, align 1 + %tmp9 = sext i8 %tmp8 to i64 + %tmp10 = sub i64 %tmp9, %tmp7 + %tmp11 = add i64 %tmp10, undef + %tmp13 = trunc i64 %tmp11 to i32 + %tmp14 = sub i32 %tmp13, %tmp2 + %tmp15 = add nuw nsw i64 %tmp7, 1 + %tmp16 = icmp slt i64 %tmp15, %tmp + br i1 %tmp16, label %loop2, label %bb5 +} + +@A = weak global [1000 x i32] zeroinitializer, align 32 + +; Demonstrate a situation when we can add two recs with different degrees from +; the same loop. 
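+; (Below, %SQ = %i.0 * %i.0 is the quadratic chrec {4,+,5,+,2}<%bb3>: its
+; values 4, 9, 16, 25, ... for %i.0 = 2, 3, 4, 5, ... have first differences
+; 5, 7, 9, ... which start at 5 and themselves step by 2. Subtracting the
+; linear {4,+,2}<%bb3> is again component-wise and leaves {0,+,3,+,2}<%bb3>.)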
+define void @test_05(i32 %N) { + +; CHECK-LABEL: Classifying expressions for: @test_05 +; CHECK: %SQ = mul i32 %i.0, %i.0 +; CHECK-NEXT: --> {4,+,5,+,2}<%bb3> +; CHECK: %tmp4 = mul i32 %i.0, 2 +; CHECK-NEXT: --> {4,+,2}<%bb3> +; CHECK: %tmp5 = sub i32 %SQ, %tmp4 +; CHECK-NEXT: --> {0,+,3,+,2}<%bb3> + +entry: + %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] + br label %bb3 + +bb: ; preds = %bb3 + %tmp = getelementptr [1000 x i32], [1000 x i32]* @A, i32 0, i32 %i.0 ; [#uses=1] + store i32 123, i32* %tmp + %tmp2 = add i32 %i.0, 1 ; [#uses=1] + br label %bb3 + +bb3: ; preds = %bb, %entry + %i.0 = phi i32 [ 2, %entry ], [ %tmp2, %bb ] ; [#uses=3] + %SQ = mul i32 %i.0, %i.0 + %tmp4 = mul i32 %i.0, 2 + %tmp5 = sub i32 %SQ, %tmp4 + %tmp3 = icmp sle i32 %tmp5, 9999 ; [#uses=1] + br i1 %tmp3, label %bb, label %bb5 + +bb5: ; preds = %bb3 + br label %return + +return: ; preds = %bb5 + ret void +} + +; Check that we can add Phis from different loops with different nesting, nested +; loop comes first. +define void @test_06() { + +; CHECK-LABEL: Classifying expressions for: @test_06 +; CHECK: %s1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s2 = add i32 %phi2, %phi1 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s3 = add i32 %phi1, %phi3 +; CHECK-NEXT: --> {{{{}}40,+,1}<%loop1>,+,3}<%loop3> +; CHECK: %s4 = add i32 %phi3, %phi1 +; CHECK-NEXT: --> {{{{}}40,+,1}<%loop1>,+,3}<%loop3> +; CHECK: %s5 = add i32 %phi2, %phi3 +; CHECK-NEXT: --> {{{{}}50,+,2}<%loop2>,+,3}<%loop3> +; CHECK: %s6 = add i32 %phi3, %phi2 +; CHECK-NEXT: --> {{{{}}50,+,2}<%loop2>,+,3}<%loop3> + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1.exit ] + br label %loop2 + +loop2: + %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ] + %phi2.inc = add i32 %phi2, 2 + %cond2 = icmp ult i32 %phi2.inc, 1000 + br i1 %cond2, label %loop2, label %loop1.exit + +loop1.exit: + %phi1.inc = add i32 %phi1, 1 + %cond1 = icmp ult i32 %phi1.inc, 1000 + br i1 %cond1, label %loop1, label %loop3 + +loop3: + %phi3 = phi i32 [ 30, %loop1.exit ], [ %phi3.inc, %loop3 ] + %phi3.inc = add i32 %phi3, 3 + %cond3 = icmp ult i32 %phi3.inc, 1000 + br i1 %cond3, label %loop3, label %exit + +exit: + %s1 = add i32 %phi1, %phi2 + %s2 = add i32 %phi2, %phi1 + %s3 = add i32 %phi1, %phi3 + %s4 = add i32 %phi3, %phi1 + %s5 = add i32 %phi2, %phi3 + %s6 = add i32 %phi3, %phi2 + ret void +} + +; Check that we can add Phis from different loops with different nesting, nested +; loop comes second. 
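+; (@test_07 is @test_06 with %loop3 moved in front of the %loop1/%loop2 nest,
+; so the sum of the same two phis prints with the other loop outermost: %s3 is
+; {{40,+,1}<%loop1>,+,3}<%loop3> above but {{40,+,3}<%loop3>,+,1}<%loop1>
+; below; the start, 10 + 30 = 40, is unchanged.)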
+define void @test_07() { + +; CHECK-LABEL: Classifying expressions for: @test_07 +; CHECK: %s1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s2 = add i32 %phi2, %phi1 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s3 = add i32 %phi1, %phi3 +; CHECK-NEXT: --> {{{{}}40,+,3}<%loop3>,+,1}<%loop1> +; CHECK: %s4 = add i32 %phi3, %phi1 +; CHECK-NEXT: --> {{{{}}40,+,3}<%loop3>,+,1}<%loop1> +; CHECK: %s5 = add i32 %phi2, %phi3 +; CHECK-NEXT: --> {{{{}}50,+,3}<%loop3>,+,2}<%loop2> +; CHECK: %s6 = add i32 %phi3, %phi2 +; CHECK-NEXT: --> {{{{}}50,+,3}<%loop3>,+,2}<%loop2> + +entry: + br label %loop3 + +loop3: + %phi3 = phi i32 [ 30, %entry ], [ %phi3.inc, %loop3 ] + %phi3.inc = add i32 %phi3, 3 + %cond3 = icmp ult i32 %phi3.inc, 1000 + br i1 %cond3, label %loop3, label %loop1 + +loop1: + %phi1 = phi i32 [ 10, %loop3 ], [ %phi1.inc, %loop1.exit ] + br label %loop2 + +loop2: + %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ] + %phi2.inc = add i32 %phi2, 2 + %cond2 = icmp ult i32 %phi2.inc, 1000 + br i1 %cond2, label %loop2, label %loop1.exit + +loop1.exit: + %phi1.inc = add i32 %phi1, 1 + %cond1 = icmp ult i32 %phi1.inc, 1000 + br i1 %cond1, label %exit, label %loop1 + +exit: + %s1 = add i32 %phi1, %phi2 + %s2 = add i32 %phi2, %phi1 + %s3 = add i32 %phi1, %phi3 + %s4 = add i32 %phi3, %phi1 + %s5 = add i32 %phi2, %phi3 + %s6 = add i32 %phi3, %phi2 + ret void +} diff --git a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll index eab314eaa9c2..655d4558a5e1 100644 --- a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll +++ b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll @@ -5,22 +5,22 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32- ; TBAA should prove that these calls don't interfere, since they are ; IntrArgReadMem and have TBAA metadata. 
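; Concretely, with the TBAA tags proving that the store to %q cannot alias the
; loads from %p, the second load is redundant and should be removed; the CHECK
; lines below therefore expect %c = add %a, %a rather than add %a, %b.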
-; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { +; CHECK: define <8 x i16> @test0(<8 x i16>* %p, <8 x i16>* %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[NUW:#[0-9]+]] -; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[NUW:#[0-9]+]] +; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m) ; CHECK-NEXT: %c = add <8 x i16> %a, %a -define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { +define <8 x i16> @test0(<8 x i16>* %p, <8 x i16>* %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { entry: - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2 - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1 - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2 + %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind, !tbaa !2 + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m), !tbaa !1 + %b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind, !tbaa !2 %c = add <8 x i16> %a, %b ret <8 x i16> %c } -declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) nounwind readonly +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind ; CHECK: attributes #0 = { argmemonly nounwind readonly } ; CHECK: attributes #1 = { argmemonly nounwind } diff --git a/test/Assembler/globalvariable-attributes.ll b/test/Assembler/globalvariable-attributes.ll new file mode 100644 index 000000000000..64227a451c25 --- /dev/null +++ b/test/Assembler/globalvariable-attributes.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s + +@g1 = global i32 7 "key" = "value" "key2" = "value2" +@g2 = global i32 2, align 4 "key3" = "value3" +@g3 = global i32 2 #0 +@g4 = global i32 2, align 4 "key5" = "value5" #0 + +attributes #0 = { "string" = "value" nobuiltin norecurse } + +; CHECK: @g1 = global i32 7 #0 +; CHECK: @g2 = global i32 2, align 4 #1 +; CHECK: @g3 = global i32 2 #2 +; CHECK: @g4 = global i32 2, align 4 #3 + +; CHECK: attributes #0 = { "key"="value" "key2"="value2" } +; CHECK: attributes #1 = { "key3"="value3" } +; CHECK: attributes #2 = { nobuiltin norecurse "string"="value" } +; CHECK: attributes #3 = { nobuiltin norecurse "key5"="value5" "string"="value" } + diff --git a/test/Bitcode/globalvariable-attributes.ll b/test/Bitcode/globalvariable-attributes.ll new file mode 100644 index 000000000000..cbab3b71e58a --- /dev/null +++ b/test/Bitcode/globalvariable-attributes.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +@g1 = global i32 7 "key" = "value" "key2" = "value2" +@g2 = global i32 2, align 4 "key3" = "value3" +@g3 = global i32 2 #0 +@g4 = global i32 2, align 4 "key5" = "value5" #0 + +attributes #0 = { "string" = "value" nobuiltin norecurse } + +; CHECK: @g1 = global i32 7 #0 +; CHECK: @g2 = global i32 2, align 4 #1 +; CHECK: @g3 = global i32 2 #2 +; CHECK: @g4 = global i32 2, align 
4 #3 + +; CHECK: attributes #0 = { "key"="value" "key2"="value2" } +; CHECK: attributes #1 = { "key3"="value3" } +; CHECK: attributes #2 = { nobuiltin norecurse "string"="value" } +; CHECK: attributes #3 = { nobuiltin norecurse "key5"="value5" "string"="value" } + diff --git a/test/Bitcode/ptest-old.ll b/test/Bitcode/ptest-old.ll index c1e1cae37368..53ffef900b57 100644 --- a/test/Bitcode/ptest-old.ll +++ b/test/Bitcode/ptest-old.ll @@ -1,5 +1,6 @@ ; RUN: llvm-as < %s | llvm-dis | FileCheck %s ; RUN: verify-uselistorder < %s +; REQUIRES: x86 define i32 @foo(<4 x float> %bar) nounwind { entry: diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll index 982bb5cb7e53..b64d5bd52bfc 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123 -; CHECK-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK: +; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123 +; CHECK-NEXT: +; CHECK-NEXT: + +; CHECK: +; COMBINED_NEXT: + + +; ModuleID = 'thinlto-function-summary-callgraph.ll' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This function have high profile count, so entry block is hot. +define void @hot_function(i1 %a, i1 %a2) !prof !20 { +entry: + call void @hot1() + br i1 %a, label %Cold, label %Hot, !prof !41 +Cold: ; 1/1000 goes here + call void @cold() + call void @hot2() + call void @hot4(), !prof !15 + call void @none1() + br label %exit +Hot: ; 999/1000 goes here + call void @hot2() + call void @hot3() + br i1 %a2, label %None1, label %None2, !prof !42 +None1: ; half goes here + call void @none1() + call void @none2() + br label %exit +None2: ; half goes here + call void @none3() + br label %exit +exit: + ret void +} + +declare void @hot1() #1 +declare void @hot2() #1 +declare void @hot3() #1 +declare void @hot4() #1 +declare void @cold() #1 +declare void @none1() #1 +declare void @none2() #1 +declare void @none3() #1 + + +!41 = !{!"branch_weights", i32 1, i32 1000} +!42 = !{!"branch_weights", i32 1, i32 1} + + + +!llvm.module.flags = !{!1} +!20 = !{!"function_entry_count", i64 110, i64 123} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"SampleProfile"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 10} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} +!15 = !{!"branch_weights", i32 100} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir index 739fdd5cb4c5..0f054f1d940c 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir @@ -74,6 +74,21 @@ %res = bitcast <2 x i32> %vres to i64 ret i64 %res } + + define i64 @floatingPointLoad(i64 %arg1, double* %addr) { + %varg1 = bitcast i64 %arg1 to double + %varg2 = load double, double* %addr + %vres = fadd double %varg1, %varg2 + %res = bitcast double %vres to i64 + ret i64 %res + } + + 
define void @floatingPointStore(i64 %arg1, double* %addr) { + %varg1 = bitcast i64 %arg1 to double + %vres = fadd double %varg1, %varg1 + store double %vres, double* %addr + ret void + } ... --- @@ -650,3 +665,84 @@ body: | RET_ReallyLR implicit %x0 ... + +--- +# Make sure we map what looks like floating point +# loads to floating point register bank. +# CHECK-LABEL: name: floatingPointLoad +name: floatingPointLoad +legalized: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + +# No repairing should be necessary for both modes. +# CHECK: %0(s64) = COPY %x0 +# CHECK-NEXT: %1(p0) = COPY %x1 +# CHECK-NEXT: %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr) +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %4(s64) = COPY %0 +# CHECK-NEXT: %3(s64) = G_FADD %4, %2 +# CHECK-NEXT: %x0 = COPY %3(s64) +# CHECK-NEXT: RET_ReallyLR implicit %x0 + +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(p0) = COPY %x1 + %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr) + %3(s64) = G_FADD %0, %2 + %x0 = COPY %3(s64) + RET_ReallyLR implicit %x0 + +... + +--- +# Make sure we map what looks like floating point +# stores to floating point register bank. +# CHECK-LABEL: name: floatingPointStore +name: floatingPointStore +legalized: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + +# CHECK: %0(s64) = COPY %x0 +# CHECK-NEXT: %1(p0) = COPY %x1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3(s64) = COPY %0 +# CHECK-NEXT: %4(s64) = COPY %0 +# CHECK-NEXT: %2(s64) = G_FADD %3, %4 +# CHECK-NEXT: G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr) +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(p0) = COPY %x1 + %2(s64) = G_FADD %0, %0 + G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr) + RET_ReallyLR + +... 
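+# (A note on the repairing above: RegBankSelect does not retroactively change
+# the bank of a vreg, so when an operand's current bank (GPR, coming from the
+# COPY of %x0) does not match the bank the chosen instruction mapping requires
+# (FPR for G_FADD), it repairs the site by inserting a cross-bank COPY into a
+# fresh vreg; those are the extra copies the CHECK lines expect.)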
diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/test/CodeGen/AArch64/GlobalISel/call-translator.ll index f8d95c88cc8f..44705a9c9f65 100644 --- a/test/CodeGen/AArch64/GlobalISel/call-translator.ll +++ b/test/CodeGen/AArch64/GlobalISel/call-translator.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s ; CHECK-LABEL: name: test_trivial_call -; CHECK: ADJCALLSTACKDOWN 0, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def %sp, implicit %sp ; CHECK: BL @trivial_callee, csr_aarch64_aapcs, implicit-def %lr ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def %sp, implicit %sp declare void @trivial_callee() @@ -186,7 +186,7 @@ define void @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs, i64* %addr) { ; CHECK: [[C42:%[0-9]+]](s64) = G_CONSTANT i64 42 ; CHECK: [[C12:%[0-9]+]](s64) = G_CONSTANT i64 12 ; CHECK: [[PTR:%[0-9]+]](p0) = G_CONSTANT i64 0 -; CHECK: ADJCALLSTACKDOWN 24, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 24, 0, implicit-def %sp, implicit %sp ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp ; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 0 ; CHECK: [[C42_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C42_OFFS]](s64) diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index 2682fa7dcce1..fc1aeb7b37d9 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -378,11 +378,11 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) { ; CHECK-NEXT: cmp x0, #13 ; CHECK-NOT: ccmp ; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: cmp x2, #2 ; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt ; CHECK-NEXT: cmp x2, #4 ; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt -; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]] ; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]] ; CHECK-NEXT: cmp [[REG6]], #0 diff --git a/test/CodeGen/AArch64/arm64-fml-combines.ll b/test/CodeGen/AArch64/arm64-fml-combines.ll index 840d1dcbf060..f97498825279 100644 --- a/test/CodeGen/AArch64/arm64-fml-combines.ll +++ b/test/CodeGen/AArch64/arm64-fml-combines.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -fp-contract=fast | FileCheck %s + define void @foo_2d(double* %src) { entry: %arrayidx1 = getelementptr inbounds double, double* %src, i64 5 @@ -126,3 +128,23 @@ for.body: ; preds = %for.body, %entry for.end: ; preds = %for.body ret void } + +; CHECK-LABEL: test1: +; CHECK: fnmadd s0, s0, s1, s2 +define float @test1(float %a, float %b, float %c) { +entry: + %0 = fmul float %a, %b + %mul = fsub float -0.000000e+00, %0 + %sub1 = fsub float %mul, %c + ret float %sub1 +} + +; CHECK-LABEL: test2: +; CHECK: fnmadd d0, d0, d1, d2 +define double @test2(double %a, double %b, double %c) { +entry: + %0 = fmul double %a, %b + %mul = fsub double -0.000000e+00, %0 + %sub1 = fsub double %mul, %c + ret double %sub1 +} diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll index caaf8615cd4a..a8d1c2482520 100644 --- a/test/CodeGen/AArch64/arm64-hello.ll +++ b/test/CodeGen/AArch64/arm64-hello.ll @@ -6,8 +6,8 @@ ; CHECK-NEXT: stp x29, x30, [sp, #16] ; CHECK-NEXT: add x29, sp, #16 ; CHECK-NEXT: stur wzr, [x29, 
#-4] -; CHECK: adrp x0, L_.str@PAGE -; CHECK: add x0, x0, L_.str@PAGEOFF +; CHECK: adrp x0, l_.str@PAGE +; CHECK: add x0, x0, l_.str@PAGEOFF ; CHECK-NEXT: bl _puts ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; CHECK-NEXT: add sp, sp, #32 diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll index 3593668e0156..4c0195b93a44 100644 --- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll +++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll @@ -12,7 +12,7 @@ ; CHECK: Successors: ; CHECK-NOT: ch SU(4) ; CHECK: SU(3) -; CHECK: SU(4): STRWui %WZR, %X{{[0-9]+}} +; CHECK: SU(5): STRWui %WZR, %X{{[0-9]+}} define i32 @foo() { entry: %0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 0), align 4 diff --git a/test/CodeGen/AArch64/macho-global-symbols.ll b/test/CodeGen/AArch64/macho-global-symbols.ll new file mode 100644 index 000000000000..d68abad57ccd --- /dev/null +++ b/test/CodeGen/AArch64/macho-global-symbols.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=arm64-apple-ios %s -o - | FileCheck %s + +; All global symbols must be at-most linker-private for AArch64 because we don't +; use section-relative relocations in MachO. + +define i8* @private_sym() { +; CHECK-LABEL: private_sym: +; CHECK: adrp [[HIBITS:x[0-9]+]], l_var@PAGE +; CHECK: add x0, [[HIBITS]], l_var@PAGEOFF + + ret i8* getelementptr([2 x i8], [2 x i8]* @var, i32 0, i32 0) +} + +; CHECK: .section __TEXT,__cstring +; CHECK: l_var: +; CHECK: .asciz "\002" +@var = private unnamed_addr constant [2 x i8] [i8 2, i8 0] diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll index f29dfb3a9802..4c682e594e66 100644 --- a/test/CodeGen/AArch64/misched-fusion-aes.ll +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -1,4 +1,5 @@ ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -87,6 +88,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]] ; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]] ; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} @@ -187,6 +204,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ; CHECKA57-NEXT: aesimc 
{{v[0-7].16b}}, [[VG]] ; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] +; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]] ; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll index 5646703fa403..677ff8dc2530 100644 --- a/test/CodeGen/AArch64/stackmap-frame-setup.ll +++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll @@ -7,11 +7,11 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata -; ISEL: ADJCALLSTACKDOWN 0, implicit-def +; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; ISEL-NEXT: STACKMAP ; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) -; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def +; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; FAST-ISEL-NEXT: STACKMAP ; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def ret void diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir index 56a9e7022db9..2a3d3887ed69 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -14,7 +14,7 @@ regBankSelected: true # GCN: global_addrspace # GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 -# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0 +# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0 body: | bb.0: diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir index ea435725bf25..89be3bde94a8 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -15,7 +15,7 @@ regBankSelected: true # GCN: global_addrspace # GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 # GCN: [[VAL:%[0-9]+]] = COPY %vgpr2 -# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0 +# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0 body: | bb.0: diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir new file mode 100644 index 000000000000..8839ba8e0ab2 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir @@ -0,0 +1,20 @@ +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + define void @test_constant() { + entry: + ret void + } +... + +--- +name: test_constant +registers: + - { id: 0, class: _ } +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_constant + ; CHECK: %0(s32) = G_CONSTANT i32 5 + + %0(s32) = G_CONSTANT i32 5 +... 
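+# (Nothing should be rewritten here: a 32-bit G_CONSTANT is already legal for
+# AMDGPU, so the legalizer is expected to pass the instruction through
+# untouched, which is exactly what the CHECK line verifies.)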
diff --git a/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg new file mode 100644 index 000000000000..e99d1bb8446c --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'global-isel' in config.root.available_features: + config.unsupported = True diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 62b47beb1251..bc992ed77ffd 100644 --- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -219,19 +219,19 @@ body: | %34 = V_MOV_B32_e32 63, implicit %exec %27 = V_AND_B32_e64 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %27, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_AND_B32_e64 %24, %26, implicit %exec - FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %29 = V_AND_B32_e32 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %29, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %30 = V_AND_B32_e64 %26, %26, implicit %exec - FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %30, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %31 = V_AND_B32_e64 %34, %34, implicit %exec - FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %31, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -407,34 +407,34 @@ body: | %27 = S_MOV_B32 -4 %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_LSHL_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_LSHL_B32_e64 12, %7, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_LSHL_B32_e64 12, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_LSHL_B32_e64 %6, 12, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD 
%20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_LSHL_B32_e64 %6, 32, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_LSHL_B32_e32 %6, %6, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_LSHL_B32_e32 %27, %6, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -615,34 +615,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_ASHR_I32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_ASHR_I32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_ASHR_I32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_ASHR_I32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_ASHR_I32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_ASHR_I32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 
0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_ASHR_I32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -824,34 +824,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_LSHR_B32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_LSHR_B32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_LSHR_B32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_LSHR_B32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_LSHR_B32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_LSHR_B32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_LSHR_B32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 0831d250b9e7..8611cd080e15 100644 --- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_or_0: -; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN: 
v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) { @@ -50,7 +50,7 @@ define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_xor_0: -; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { @@ -86,8 +86,8 @@ define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_not_0: -; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} @@ -104,8 +104,8 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; GCN: buffer_load_dwordx2 ; GCN: buffer_load_dwordx2 v{{\[}}[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]{{\]}} -; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll index a29e72ea57cb..aa913ad406d2 100644 --- a/test/CodeGen/AMDGPU/ctpop.ll +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) ; XXX - Why 0 in register? 
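; (v_bcnt_u32_b32{{(_e64)*}} in the checks below is a FileCheck regex that
; matches the mnemonic with or without an explicit _e64 encoding suffix,
; keeping the test independent of which VOP encoding the printer emits.)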
; FUNC-LABEL: {{^}}v_ctpop_i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -40,9 +40,9 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: ; GCN: buffer_load_dword [[VAL1:v[0-9]+]], ; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -61,7 +61,7 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: ; GCN: buffer_load_dword [[VAL0:v[0-9]+]], ; GCN: s_waitcnt -; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} +; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { @@ -73,8 +73,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, } ; FUNC-LABEL: {{^}}v_ctpop_v2i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -87,10 +87,10 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v4i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -105,14 +105,14 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v8i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -131,22 +131,22 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v16i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: 
v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -189,7 +189,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -206,7 +206,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { @@ -220,7 +220,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -236,7 +236,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv: ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}} ; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index 2610684ad9ee..f18bd9fd8174 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -26,9 +26,9 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) ; FUNC-LABEL: {{^}}v_ctpop_i64: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} 
[[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { @@ -41,9 +41,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs ; FUNC-LABEL: {{^}}v_ctpop_i64_user: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} @@ -171,11 +171,11 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) ; FUNC-LABEL: {{^}}v_ctpop_i128: ; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 -; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 +; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] -; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 -; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 +; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]] diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll index 1c0e9a2f13ce..66bf9d0ffb00 100644 --- a/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1471,11 +1471,10 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] -; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] -; GCN: buffer_store_dword [[MUL]] +; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0 +; GCN-NEXT: buffer_store_dword [[ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll index 626a0b50cce8..ed36666db807 100644 --- a/test/CodeGen/AMDGPU/fneg.f16.ll +++ 
b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
; FIXME: Should be able to do scalar op
; GCN-LABEL: {{^}}s_fneg_f16:
@@ -129,6 +129,41 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
ret void
}
+; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
+; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+
+; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
+; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
+; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
+define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %elt0 = extractelement <2 x half> %fneg, i32 0
+ %elt1 = extractelement <2 x half> %fneg, i32 1
+
+ %fmul0 = fmul half %elt0, 4.0
+ %fadd1 = fadd half %elt1, 2.0
+ store volatile half %fmul0, half addrspace(1)* undef
+ store volatile half %fadd1, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
+; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]
+define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %elt0 = extractelement <2 x half> %fneg, i32 0
+ %elt1 = extractelement <2 x half> %fneg, i32 1
+ store volatile half %elt0, half addrspace(1)* undef
+ store volatile half %elt1, half addrspace(1)* undef
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index c6fe6debd225..ff9fcd1c693f 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -246,15 +246,15 @@ body: |
S_BRANCH %bb.1
bb.1:
- FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
-
FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr + FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec - FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr + FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec S_ENDPGM diff --git a/test/CodeGen/AMDGPU/limit-coalesce.mir b/test/CodeGen/AMDGPU/limit-coalesce.mir index 106a96e32dc3..a0d2d6c097a2 100644 --- a/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -57,15 +57,15 @@ body: | %4.sub1 = COPY %3.sub0 undef %5.sub0 = COPY %4.sub1 %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, implicit %exec, implicit %flat_scr %6 = IMPLICIT_DEF undef %7.sub0_sub1 = COPY %6 %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, implicit %exec, implicit %flat_scr %8 = IMPLICIT_DEF undef %9.sub0_sub1_sub2 = COPY %8 %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, implicit %exec, implicit %flat_scr ... diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index b92eb34750d9..7179d02fc6dd 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -7,7 +7,7 @@ ; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]] define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32: ; GCN: s_load_dword [[X:s[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out @@ -39,7 +39,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm: ; 
GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0 +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -70,7 +70,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, 1.0, [[A]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -103,7 +103,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -140,7 +140,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll index ab76c870796b..144c8f428ab0 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll +++ 
b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mbcnt_intrinsics: -; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0 ; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] -; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] +; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]] define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) { main_body: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index eb4066a2a0a8..5f1fb0e2d732 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madak_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -63,7 +63,7 @@ define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, flo ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -198,7 +198,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GCN: buffer_load_dword [[VGPR:v[0-9]+]] -; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]] ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll index 9c43a6dc60f4..d7655993a2d9 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -1,26 +1,26 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s ; CHECK-LABEL: @volatile_load( -; CHECK: alloca [5 x i32] +; CHECK: alloca [4 x i32] ; CHECK: load volatile i32, i32* define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [4 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 
%tmp %load = load volatile i32, i32* %arrayidx1 store i32 %load, i32 addrspace(1)* %out ret void } ; CHECK-LABEL: @volatile_store( -; CHECK: alloca [5 x i32] +; CHECK: alloca [4 x i32] ; CHECK: store volatile i32 %tmp, i32* define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [4 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp store volatile i32 %tmp, i32* %arrayidx1 ret void } diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll index bfb10503aaea..0148ff470b78 100644 --- a/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}madak_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} +; VI: v_madak_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} ; VI: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @madak_f16( diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir index 38662e83b359..f754415dccb4 100644 --- a/test/CodeGen/AMDGPU/waitcnt.mir +++ b/test/CodeGen/AMDGPU/waitcnt.mir @@ -51,21 +51,21 @@ name: flat_zero_waitcnt body: | bb.0: successors: %bb.1 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4) - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4) + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_BRANCH %bb.1 bb.1: successors: %bb.2 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_BRANCH %bb.2 bb.2: - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4) - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4) + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_ENDPGM ... 
@@ -86,11 +86,11 @@ name: single_fallthrough_successor_no_end_block_wait body: | bb.0: successors: %bb.1 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr bb.1: %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec - FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM ... --- @@ -114,15 +114,15 @@ name: single_branch_successor_not_next_block body: | bb.0: successors: %bb.2 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr S_BRANCH %bb.2 bb.1: - FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM bb.2: %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec - FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM ... diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 83ab2659ef4a..72c3b715d36e 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -4,6 +4,8 @@ define void @test_sext_s1() { ret void } define void @test_sext_s8() { ret void } define void @test_zext_s16() { ret void } + define void @test_anyext_s8() { ret void } + define void @test_anyext_s16() { ret void } define void @test_trunc_s32_16() { ret void } @@ -149,6 +151,58 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_anyext_s8 +# CHECK-LABEL: name: test_anyext_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_anyext_s16 +# CHECK-LABEL: name: test_anyext_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
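Illustrative aside (not part of the imported patch): the test_anyext_s8 and test_anyext_s16 cases added above check that instruction selection lowers G_ANYEXT to a plain COPY, because an s8 or s16 value already occupies a full 32-bit GPR on ARM. A minimal IR-level sketch of where such a G_ANYEXT arises, assuming the usual AAPCS handling of a plain (non-signext/zeroext) narrow return value; the function name is hypothetical:

; Returning i8 through r0 leads GlobalISel to insert a G_ANYEXT to s32
; at the ABI boundary; the selector exercised above folds it to a COPY.
define i8 @anyext_demo(i8 %x) {
entry:
  ret i8 %x
}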
+--- name: test_trunc_s32_16 # CHECK-LABEL: name: test_trunc_s32_16 legalized: true @@ -187,9 +241,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -200,11 +260,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_ADD %0, %1 - ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGSUM]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGSUMTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -220,9 +289,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -233,11 +308,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_ADD %0, %1 - ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGSUM]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGSUMTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -352,9 +436,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -365,11 +455,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_SUB %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -385,9 +484,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: 
gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -398,11 +503,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_SUB %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -451,9 +565,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } -# CHECK-DAG: id: 0, class: gprnopc -# CHECK-DAG: id: 1, class: gprnopc + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gprnopc +# CHECK-DAG: id: 3, class: gprnopc +# CHECK-DAG: id: 4, class: gprnopc +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -464,11 +584,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_MUL %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_MUL %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -484,9 +613,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } -# CHECK-DAG: id: 0, class: gprnopc -# CHECK-DAG: id: 1, class: gprnopc + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gprnopc +# CHECK-DAG: id: 3, class: gprnopc +# CHECK-DAG: id: 4, class: gprnopc +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -497,11 +632,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_MUL %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_MUL %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index 44fe7410b42c..53577dbd76f6 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ 
b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -421,7 +421,7 @@ entry: define arm_aapcscc void @test_indirect_call(void() *%fptr) { ; CHECK-LABEL: name: test_indirect_call ; CHECK: [[FPTR:%[0-9]+]](p0) = COPY %r0 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: BLX [[FPTR]](p0), csr_aapcs, implicit-def %lr, implicit %sp ; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp entry: @@ -433,7 +433,7 @@ declare arm_aapcscc void @call_target() define arm_aapcscc void @test_direct_call() { ; CHECK-LABEL: name: test_direct_call -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: BLX @call_target, csr_aapcs, implicit-def %lr, implicit %sp ; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp entry: @@ -447,7 +447,7 @@ define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) { ; CHECK-LABEL: name: test_call_simple_reg_params ; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: %r1 = COPY [[AVREG]] ; CHECK: BLX @simple_reg_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0 @@ -466,7 +466,7 @@ define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) { ; CHECK-LABEL: name: test_call_simple_stack_params ; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 -; CHECK: ADJCALLSTACKDOWN 8, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: %r1 = COPY [[AVREG]] ; CHECK-DAG: %r2 = COPY [[BVREG]] @@ -496,7 +496,7 @@ define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) { ; CHECK-DAG: [[AVREG:%[0-9]+]](s8) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s16) = COPY %r1 ; CHECK-DAG: [[CVREG:%[0-9]+]](s1) = COPY %r2 -; CHECK: ADJCALLSTACKDOWN 20, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 20, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: [[SEXTA:%[0-9]+]](s32) = G_SEXT [[AVREG]](s8) ; CHECK: %r0 = COPY [[SEXTA]] ; CHECK: [[ZEXTA:%[0-9]+]](s32) = G_ZEXT [[AVREG]](s8) @@ -547,7 +547,7 @@ define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) { ; CHECK-LABEL: name: test_call_vfpcc_fp_params ; CHECK-DAG: [[AVREG:%[0-9]+]](s64) = COPY %d0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %s2 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %s0 = COPY [[BVREG]] ; CHECK-DAG: %d1 = COPY [[AVREG]] ; CHECK: BLX @vfpcc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %d1, implicit-def %d0 @@ -569,7 +569,7 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) { ; LITTLE-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A1]](s32), 0, [[A2]](s32), 32 ; BIG-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A2]](s32), 0, [[A1]](s32), 32 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r2 -; CHECK: ADJCALLSTACKDOWN 16, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 16, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: 
[[A1:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 0 ; CHECK-DAG: [[A2:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 32 @@ -608,7 +608,7 @@ declare arm_aapcscc float @different_call_conv_target(float) define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) { ; CHECK-LABEL: name: test_call_different_call_conv ; CHECK: [[X:%[0-9]+]](s32) = COPY %s0 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: %r0 = COPY [[X]] ; CHECK: BLX @different_call_conv_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit-def %r0 ; CHECK: [[R:%[0-9]+]](s32) = COPY %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 625d35acf17b..f6ac92597cb2 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -91,8 +91,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_ADD %0, %1 - ; G_ADD with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} + ; G_ADD with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... @@ -115,8 +116,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_ADD %0, %1 - ; G_ADD with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} + ; G_ADD with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 @@ -165,8 +167,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_SUB %0, %1 - ; G_SUB with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} + ; G_SUB with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... @@ -189,8 +192,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_SUB %0, %1 - ; G_SUB with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} + ; G_SUB with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 @@ -239,8 +243,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_MUL %0, %1 - ; G_MUL with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} + ; G_MUL with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... 
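Illustrative aside (not part of the imported patch): the legalizer hunks above change s8/s16 G_ADD and G_SUB (and G_MUL below) from legal to widened, so narrow arithmetic is rewritten as extend, 32-bit op, truncate. A minimal source-level sketch of the pattern this affects; the function name is hypothetical:

; After this change, the s8 G_ADD for this function is legalized as
;   G_ANYEXT %a, G_ANYEXT %b -> s32 G_ADD -> G_TRUNC
; rather than being selected directly at 8 bits.
define i8 @widen_demo(i8 %a, i8 %b) {
entry:
  %sum = add i8 %a, %b
  ret i8 %sum
}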
@@ -263,8 +268,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_MUL %0, %1 - ; G_MUL with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} + ; G_MUL with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index 4e94fb4e3481..dfccc47c277c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -25,6 +25,9 @@ define void @test_constants() { ret void } + define void @test_anyext_s8_32() { ret void } + define void @test_anyext_s16_32() { ret void } + define void @test_trunc_s32_16() { ret void } define void @test_fadd_s32() #0 { ret void } @@ -71,19 +74,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_ADD %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_ADD %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -97,19 +109,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_ADD %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_ADD %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -123,19 +144,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s1) = COPY %r0 %1(s1) = COPY %r1 - %2(s1) = G_ADD %0, %1 - %r0 = COPY %2(s1) + %2(s32) = G_ANYEXT %0(s1) + %3(s32) = G_ANYEXT %1(s1) + %4(s32) = G_ADD %2, %3 + %5(s1) = G_TRUNC %4(s32) + %r0 = COPY %5(s1) BX_RET 14, _, implicit %r0 ... 
@@ -175,19 +205,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_SUB %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_SUB %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -201,19 +240,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_SUB %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_SUB %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -253,19 +301,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_MUL %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_MUL %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -279,19 +336,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_MUL %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_MUL %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -500,6 +566,48 @@ body: | BX_RET 14, _, implicit %r0 ... --- +name: test_anyext_s8_32 +# CHECK-LABEL: name: test_anyext_s8_32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + %1(s32) = G_ANYEXT %0(s8) + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 +... 
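Illustrative aside (not part of the imported patch): the regbankselect tests are updated to the same widened shape and check that every virtual register in the new G_ANYEXT/op/G_TRUNC chain is assigned to the gprb bank. A sketch tying the three GlobalISel stages together on one hypothetical function:

; Expected pipeline for a narrow multiply after this patch:
;   legalizer:     s16 G_MUL -> two G_ANYEXTs, s32 G_MUL, G_TRUNC
;   regbankselect: all of the new vregs land on gprb (as checked above)
;   selection:     the extends and truncates become plain COPYs
define i16 @mul_demo(i16 %a, i16 %b) {
entry:
  %p = mul i16 %a, %b
  ret i16 %p
}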
+--- +name: test_anyext_s16_32 +# CHECK-LABEL: name: test_anyext_s16_32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + %1(s32) = G_ANYEXT %0(s16) + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 +... +--- name: test_trunc_s32_16 # CHECK-LABEL: name: test_trunc_s32_16 legalized: true diff --git a/test/CodeGen/ARM/divmod-eabi.ll b/test/CodeGen/ARM/divmod-eabi.ll index ce5a1df05e3f..77ffc46e6a69 100644 --- a/test/CodeGen/ARM/divmod-eabi.ll +++ b/test/CodeGen/ARM/divmod-eabi.ll @@ -16,17 +16,15 @@ ; RUN: llc -mtriple armv7-linux-gnueabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI ; RUN: llc -mtriple armv7-linux-musleabi %s -o - | FileCheck %s --check-prefix=EABI ; RUN: llc -mtriple armv7-linux-musleabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI -; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT -; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0 -; FIXME: long-term, we will use "-apple-macho" and won't need this exception: -; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT -; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0 +; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN +; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=DARWIN-O0 ; RUN: llc -mtriple thumbv7-windows %s -o - | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-DEFAULT ; RUN: llc -mtriple thumbv7-windows %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-O0 define signext i16 @f16(i16 signext %a, i16 signext %b) { ; EABI-LABEL: f16: ; DARWIN-LABEL: f16: +; DARWIN-O0-LABEL: f16: ; WINDOWS-LABEL: f16: entry: %conv = sext i16 %a to i32 @@ -36,11 +34,9 @@ entry: ; EABI: __aeabi_idivmod ; EABI: mov [[div:r[0-9]+]], r0 ; EABI: mov [[rem:r[0-9]+]], r1 -; DARWIN: ___divsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __modsi3 -; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: __divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: __rt_sdiv ; WINDOWS-DEFAULT: add [[sum:r[0-9]+]], r1 @@ -48,16 +44,13 @@ entry: %rem8 = srem i32 %conv1, %conv ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div %add13 = add nsw i32 %add, %rem8 %conv14 = trunc i32 %add13 to i16 ; EABI: add r0{{.*}}r1 ; EABI: sxth r0, r0 -; DARWIN-DEFAULT: add [[res:r[0-9]+]], [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 -; DARWIN: sxth r0, [[res]] ; WINDOWS-DEFAULT: adds [[sum1:r[0-9]+]], [[sum]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], ; WINDOWS-O0: add [[sum1:r[0-9]+]], r1 @@ -68,6 +61,7 @@ entry: define i32 @f32(i32 %a, i32 %b) { ; EABI-LABEL: f32: ; DARWIN-LABEL: f32: +; DARWIN-O0-LABEL: f32: ; WINDOWS-LABEL: f32: entry: %div = sdiv i32 %a, %b @@ -75,11 +69,9 @@ entry: ; EABI: __aeabi_idivmod ; EABI: mov [[div:r[0-9]+]], r0 ; EABI: mov [[rem:r[0-9]+]], r1 -; DARWIN: ___divsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __modsi3 -; 
DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv @@ -87,13 +79,11 @@ entry: %rem1 = srem i32 %b, %a ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div %add2 = add nsw i32 %add, %rem1 ; EABI: add r0{{.*}}r1 -; DARWIN-DEFAULT: add r0, [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 ; WINDOWS-DEFAULT: adds r0, [[div]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], [[div]] ; WINDOWS-O0: add [[sum]], r1 @@ -103,16 +93,15 @@ entry: define i32 @uf(i32 %a, i32 %b) { ; EABI-LABEL: uf: ; DARWIN-LABEL: uf: +; DARWIN-O0-LABEL: uf: ; WINDOWS-LABEL: uf: entry: %div = udiv i32 %a, %b %rem = urem i32 %a, %b ; EABI: __aeabi_uidivmod -; DARWIN: ___udivsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __umodsi3 -; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: __udivmodsi4 +; DARWIN-O0: __udivsi3 +; DARWIN-O0: __umodsi3 ; WINDOWS: __rt_udiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_udiv @@ -120,13 +109,11 @@ entry: %rem1 = urem i32 %b, %a ; EABI: __aeabi_uidivmod ; DARWIN: __umodsi3 +; DARWIN-O0: __umodsi3 ; WINDOWS: __rt_udiv %add = add nuw i32 %rem, %div %add2 = add nuw i32 %add, %rem1 ; EABI: add r0{{.*}}r1 -; DARWIN-DEFAULT: add r0, [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 ; WINDOWS-DEFAULT: adds [[sum:r[0-9]+]], [[div]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], ; WINDOWS-O0: add [[sum]], r1 @@ -136,6 +123,7 @@ entry: define i64 @longf(i64 %a, i64 %b) { ; EABI-LABEL: longf: ; DARWIN-LABEL: longf: +; DARWIN-O0-LABEL: longf: ; WINDOWS-LABEL: longf: entry: %div = sdiv i64 %a, %b @@ -148,6 +136,8 @@ entry: ; DARWIN: mov [[div1:r[0-9]+]], r0 ; DARWIN: mov [[div2:r[0-9]+]], r1 ; DARWIN: __moddi3 +; DARWIN-O0: __divdi3 +; DARWIN-O0: __moddi3 ; WINDOWS: __rt_sdiv64 %add = add nsw i64 %rem, %div ; DARWIN: adds r0{{.*}}[[div1]] @@ -160,20 +150,19 @@ entry: define i16 @shortf(i16 %a, i16 %b) { ; EABI-LABEL: shortf: ; DARWIN-LABEL: shortf: +; DARWIN-O0-LABEL: shortf: ; WINDOWS-LABEL: shortf: entry: %div = sdiv i16 %a, %b %rem = srem i16 %a, %b ; EABI: __aeabi_idivmod -; DARWIN: ___divsi3 -; DARWIN: mov [[div1:r[0-9]+]], r0 -; DARWIN: __modsi3 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divmodsi4 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv %add = add nsw i16 %rem, %div ; EABI: add r0, r1 -; DARWIN: add r0{{.*}}[[div1]] ; WINDOWS: adds r0, r1, [[div]] ret i16 %add } @@ -181,20 +170,20 @@ entry: define i32 @g1(i32 %a, i32 %b) { ; EABI-LABEL: g1: ; DARWIN-LABEL: g1: +; DARWIN-O0-LABEL: g1: ; WINDOWS-LABEL: g1: entry: %div = sdiv i32 %a, %b %rem = srem i32 %a, %b ; EABI: __aeabi_idivmod -; DARWIN: ___divsi3 -; DARWIN: mov [[sum:r[0-9]+]], r0 -; DARWIN: __modsi3 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div ; EABI: add r0{{.*}}r1 -; DARWIN: add r0{{.*}}[[sum]] ; WINDOWS: adds r0, r1, [[div]] ret i32 %add } @@ -203,11 +192,13 @@ entry: define i32 @g2(i32 %a, i32 %b) { ; EABI-LABEL: g2: ; DARWIN-LABEL: g2: +; DARWIN-O0-LABEL: g2: ; WINDOWS-LABEL: g2: entry: %rem = srem i32 %a, %b ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; 
DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ret i32 %rem ; EABI: mov r0, r1 @@ -217,6 +208,7 @@ entry: define i32 @g3(i32 %a, i32 %b) { ; EABI-LABEL: g3: ; DARWIN-LABEL: g3: +; DARWIN-O0-LABEL: g3: ; WINDOWS-LABEL: g3: entry: %rem = srem i32 %a, %b @@ -224,11 +216,13 @@ entry: ; EABI: mov [[mod:r[0-9]+]], r1 ; DARWIN: __modsi3 ; DARWIN: mov [[sum:r[0-9]+]], r0 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[rem:r[0-9]+]], r1 %rem1 = srem i32 %b, %rem ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem1, %rem ; EABI: add r0, r1, [[mod]] @@ -240,6 +234,7 @@ entry: define i32 @g4(i32 %a, i32 %b) { ; EABI-LABEL: g4: ; DARWIN-LABEL: g4: +; DARWIN-O0-LABEL: g4: ; WINDOWS-LABEL: g4: entry: %div = sdiv i32 %a, %b @@ -247,11 +242,13 @@ entry: ; EABI: mov [[div:r[0-9]+]], r0 ; DARWIN: ___divsi3 ; DARWIN: mov [[sum:r[0-9]+]], r0 +; DARWIN-O0: __divsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 %rem = srem i32 %b, %div ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div ; EABI: add r0, r1, [[div]] diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll index 9336d0c477d1..ffc1ed09cbf0 100644 --- a/test/CodeGen/ARM/divmod.ll +++ b/test/CodeGen/ARM/divmod.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 ; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=swift | FileCheck %s -check-prefix=SWIFT +; RUN: llc < %s -mtriple=thumbv7-apple-macho -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 ; rdar://12481395 diff --git a/test/CodeGen/AVR/select-mbb-placement-bug.ll b/test/CodeGen/AVR/select-mbb-placement-bug.ll new file mode 100644 index 000000000000..ca7ec1ab831c --- /dev/null +++ b/test/CodeGen/AVR/select-mbb-placement-bug.ll @@ -0,0 +1,35 @@ +; RUN: llc -mcpu=atmega328p < %s -march=avr | FileCheck %s + +; CHECK-LABEL: loopy +define internal fastcc void @loopy() { + +; In this case, when we expand `Select8`/`Select16`, we should be +; replacing the existing MBB instead of adding a new one. 
+;
+; https://github.com/avr-rust/rust/issues/49
+
+; CHECK: LBB0_1:
+; CHECK: LBB0_2:
+; CHECK-NOT: LBB0_3:
+start:
+ br label %bb7.preheader
+
+bb7.preheader: ; preds = %bb10, %start
+ %i = phi i8 [ 0, %start ], [ %j, %bb10 ]
+ %j = phi i8 [ 1, %start ], [ %next, %bb10 ]
+ br label %bb10
+
+bb4: ; preds = %bb10
+ ret void
+
+bb10: ; preds = %bb7.preheader
+ tail call fastcc void @observe(i8 %i, i8 1)
+ %0 = icmp ult i8 %j, 20
+ %1 = zext i1 %0 to i8
+ %next = add i8 %j, %1
+ br i1 %0, label %bb7.preheader, label %bb4
+
+}
+
+declare void @observe(i8, i8);
+
diff --git a/test/CodeGen/Generic/expand-experimental-reductions.ll b/test/CodeGen/Generic/expand-experimental-reductions.ll
new file mode 100644
index 000000000000..ef813fa7205b
--- /dev/null
+++ b/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -expand-reductions -S | FileCheck %s
+; Tests without a target which should expand all reductions
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+
+declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double>)
+
+
+define i64 @add_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @add_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @mul_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @mul_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @and_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @and_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @or_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @or_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64>
[[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @xor_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @xor_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define float @fadd_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP0]]
+;
+entry:
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define float @fadd_f32_strict(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32_strict(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define float @fmul_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fmul_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP0]]
+;
+entry:
+ %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define i64 @smax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smax_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @smin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smin_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x
i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @umax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umax_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @umin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umin_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define double @fmax_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmax_f64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret double [[TMP0]]
+;
+entry:
+ %r = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %vec)
+ ret double %r
+}
+
+define double @fmin_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmin_f64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret double [[TMP0]]
+;
+entry:
+ %r = call double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %vec)
+ ret double %r
+}
diff --git a/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/test/CodeGen/Hexagon/regalloc-bad-undef.mir
index d8fbb92b0d50..a541e766f593 100644
--- a/test/CodeGen/Hexagon/regalloc-bad-undef.mir
+++ b/test/CodeGen/Hexagon/regalloc-bad-undef.mir
@@ -161,17 +161,17 @@ body: |
bb.1.for.body:
successors: %bb.3.for.end, %bb.2.if.end82
- ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def
dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 undef %29.isub_lo = COPY killed %r0 %29.isub_hi = S2_asr_i_r %29.isub_lo, 31 - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 %32.isub_lo = COPY killed %r0 %7 = S2_extractup %32, 22, 9 - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, 
implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 undef %43.isub_lo = COPY killed %r0 @@ -179,7 +179,7 @@ body: | %16 = S2_extractup %43, 6, 25 %18 = A2_tfrpi -1 %18 = S2_asl_r_p_acc %18, %47, %16.isub_lo - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 %22 = S2_asl_r_p %18, %8.isub_lo diff --git a/test/CodeGen/Lanai/masking_setccs.ll b/test/CodeGen/Lanai/masking_setccs.ll new file mode 100644 index 000000000000..48136fd42574 --- /dev/null +++ b/test/CodeGen/Lanai/masking_setccs.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s | FileCheck %s + +; Test that unnecessary masking with 0x1 is not inserted. + +target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64" +target triple = "lanai" + +; CHECK-LABEL: masking: +; CHECK-NOT: mov 1 +define i32 @masking(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) { +entry: + %cmp = icmp ne i32 %a, 0 + %cmp1 = icmp ult i32 %a, %b + %or.cond = and i1 %cmp, %cmp1 + br i1 %or.cond, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp ne i32 %b, 0 + %cmp4 = icmp ult i32 %b, %c + %or.cond29 = and i1 %cmp2, %cmp4 + br i1 %or.cond29, label %return, label %if.end6 + +if.end6: ; preds = %if.end + %cmp7 = icmp ne i32 %c, 0 + %cmp9 = icmp ult i32 %c, %d + %or.cond30 = and i1 %cmp7, %cmp9 + br i1 %or.cond30, label %return, label %if.end11 + +if.end11: ; preds = %if.end6 + %cmp12 = icmp ne i32 %d, 0 + %cmp14 = icmp ult i32 %d, %a + %or.cond31 = and i1 %cmp12, %cmp14 + %b. = select i1 %or.cond31, i32 %b, i32 21 + ret i32 %b. 
+ +return: ; preds = %if.end6, %if.end, %entry + %retval.0 = phi i32 [ %c, %entry ], [ %d, %if.end ], [ %a, %if.end6 ] + ret i32 %retval.0 +} + +; CHECK-LABEL: notnot: +; CHECK-NOT: mov 1 +define i32 @notnot(i32 %x) { +entry: + %tobool = icmp ne i32 %x, 0 + %lnot.ext = zext i1 %tobool to i32 + ret i32 %lnot.ext +} diff --git a/test/CodeGen/Lanai/peephole-compare.mir b/test/CodeGen/Lanai/peephole-compare.mir index 5056a05ed1f6..51133b5e58e3 100644 --- a/test/CodeGen/Lanai/peephole-compare.mir +++ b/test/CodeGen/Lanai/peephole-compare.mir @@ -644,7 +644,7 @@ body: | bb.1.if.then: successors: %bb.2.while.body - ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp @@ -663,7 +663,7 @@ body: | bb.4.if.then4: successors: %bb.5.while.body6 - ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp diff --git a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir new file mode 100644 index 000000000000..96801f5b0a37 --- /dev/null +++ b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir @@ -0,0 +1,24 @@ +# RUN: llc -mtriple=arm-apple-ios -run-pass=if-converter %s -o - | FileCheck %s +--- +name: foo +body: | + bb.0: + B %bb.2 + + bb.1: + BX_RET 14, 0 + + bb.2: + Bcc %bb.1, 1, %cpsr + + bb.3: + B %bb.1 + +... + +# We should get a single block containing the BX_RET, with no successors at all + +# CHECK: body: +# CHECK-NEXT: bb.0: +# CHECK-NEXT: BX_RET + diff --git a/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir new file mode 100644 index 000000000000..5a1583f7a9be --- /dev/null +++ b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir @@ -0,0 +1,64 @@ +# RUN: llc -mtriple=arm-apple-ios -o - %s -run-pass if-converter | FileCheck %s +--- +name: f1 +body: | + bb.0: + successors: %bb.1 + + B %bb.1 + + bb.1: + successors: %bb.2, %bb.4 + + Bcc %bb.4, 1, %cpsr + + bb.2: + successors: %bb.3, %bb.5 + + Bcc %bb.5, 1, %cpsr + + bb.3: + successors: %bb.5 + + B %bb.5 + + bb.4: + successors: + + bb.5: + successors: %bb.1, %bb.6 + + Bcc %bb.1, 1, %cpsr + + bb.6: + BX_RET 14, _ + +... + +# IfConversion.cpp/canFallThroughTo thought there was a fallthrough from +# bb.4 to bb.5 even though the successor list was empty. +# bb.4 is empty, so it certainly looks like it could fall through, but this is +# what happens for a bb that just contains an "unreachable". 
+ +#CHECK: body: | +#CHECK: bb.0: +#CHECK: successors: %bb.1 + +#CHECK: bb.1: +#CHECK: successors: %bb.3({{.*}}), %bb.2 + +# The original conditional branch from bb.1, jumping to the empty bb +#CHECK: Bcc %bb.2 +#CHECK: B %bb.3 + +# Empty bb.2, which originally contained "unreachable" and thus has no successors +#CHECK: bb.2: +#CHECK-NOT: successors + +#CHECK: bb.3: +#CHECK: successors: %bb.1 + +# Conditional BX_RET and then a loop back to bb.1 +#CHECK: BX_RET 0 +#CHECK: B %bb.1 + diff --git a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir index 2d5347e5d30d..14bb5db5a51d 100644 --- a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir +++ b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir @@ -60,7 +60,7 @@ body: | liveins: %eax MOV32mr %stack.0.tmp, 1, _, 0, _, killed %eax - ADJCALLSTACKDOWN64 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp %rsi = LEA64r %stack.0.tmp, 1, _, 0, _ %edi = MOV32r0 implicit-def dead %eflags CALL64pcrel32 @doSomething, csr_64, implicit %rsp, implicit %edi, implicit %rsi, implicit-def %rsp, implicit-def %eax diff --git a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll b/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll deleted file mode 100644 index dce9d25ca87a..000000000000 --- a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s | FileCheck %s - -target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8" -target triple = "msp430-elf" - -@g_29 = common global i8 0, align 1 ; [#uses=0] - -define signext i8 @foo(i8 signext %_si1, i8 signext %_si2) nounwind readnone { -entry: -; CHECK-LABEL: foo: -; CHECK: call #__mulqi3 - %mul = mul i8 %_si2, %_si1 ; [#uses=1] - ret i8 %mul -} - -define void @uint81(i16* nocapture %p_32) nounwind { -entry: - %call = tail call i16 @bar(i8* bitcast (i8 (i8, i8)* @foo to i8*)) nounwind ; [#uses=0] - ret void -} - -declare i16 @bar(i8*) diff --git a/test/CodeGen/MSP430/hwmult16.ll b/test/CodeGen/MSP430/hwmult16.ll new file mode 100644 index 000000000000..b23f1ad37d81 --- /dev/null +++ b/test/CodeGen/MSP430/hwmult16.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mhwmult=16bit < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define i16 @mpyi() #0 { 
+entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi_hw + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl_hw32 + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll_hw32 + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/hwmultf5.ll b/test/CodeGen/MSP430/hwmultf5.ll new file mode 100644 index 000000000000..51ca4be4a654 --- /dev/null +++ b/test/CodeGen/MSP430/hwmultf5.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mhwmult=f5series < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi_f5hw + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl_f5hw + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll_f5hw + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/jumptable.ll b/test/CodeGen/MSP430/jumptable.ll index 5ccdbb701db1..b4366251698b 100644 --- a/test/CodeGen/MSP430/jumptable.ll +++ b/test/CodeGen/MSP430/jumptable.ll @@ -12,7 +12,7 @@ entry: store i16 %i, i16* %i.addr, align 2 %0 = load i16, i16* %i.addr, align 2 ; CHECK: mov.w #2, r13 -; CHECK: call #__mulhi3hw_noint +; CHECK: call #__mspabi_mpyi ; CHECK: br .LJTI0_0(r12) switch i16 %0, label %sw.default [ i16 0, label %sw.bb diff --git a/test/CodeGen/MSP430/libcalls.ll b/test/CodeGen/MSP430/libcalls.ll new file mode 100644 index 000000000000..950ed6c17e2c --- /dev/null +++ b/test/CodeGen/MSP430/libcalls.ll @@ -0,0 +1,595 @@ +; RUN: llc -O0 < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_double = global double 123.0, align 8 +@g_float = global float 123.0, align 8 +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define float @d2f() #0 { +entry: +; CHECK: d2f: + +; CHECK: call #__mspabi_cvtdf + %0 = load volatile double, double* @g_double, align 8 + %1 = fptrunc double %0 to float + + ret float %1 +} + +define double @f2d() #0 { +entry: +; CHECK: f2d: + +; CHECK: call #__mspabi_cvtfd + %0 = load volatile float, float* @g_float, align 8 + %1 = fpext float %0 to double + + ret double %1 +} + +define i32 @d2l() #0 { +entry: +; CHECK: d2l: + +; CHECK: call #__mspabi_fixdli + %0 = load volatile double, double* @g_double, align 8 + %1 = fptosi double %0 to i32 + + ret i32 %1 +} + +define i64 @d2ll() #0 { +entry: +; CHECK: d2ll: + +; CHECK: call #__mspabi_fixdlli + %0 = load volatile double, double* @g_double, align 8 + %1 = fptosi double %0 to i64 + + ret i64 %1 +} + +define i32 @d2ul() #0 { +entry: +; CHECK: d2ul: + +; CHECK: call #__mspabi_fixdul + %0 = load volatile double, double* @g_double, align 8 + %1 = fptoui double %0 to i32 + + ret i32 %1 +} + +define i64 @d2ull() #0 { 
+entry: +; CHECK: d2ull: + +; CHECK: call #__mspabi_fixdull + %0 = load volatile double, double* @g_double, align 8 + %1 = fptoui double %0 to i64 + + ret i64 %1 +} + +define i32 @f2l() #0 { +entry: +; CHECK: f2l: + +; CHECK: call #__mspabi_fixfli + %0 = load volatile float, float* @g_float, align 8 + %1 = fptosi float %0 to i32 + + ret i32 %1 +} + +define i64 @f2ll() #0 { +entry: +; CHECK: f2ll: + +; CHECK: call #__mspabi_fixflli + %0 = load volatile float, float* @g_float, align 8 + %1 = fptosi float %0 to i64 + + ret i64 %1 +} + +define i32 @f2ul() #0 { +entry: +; CHECK: f2ul: + +; CHECK: call #__mspabi_fixful + %0 = load volatile float, float* @g_float, align 8 + %1 = fptoui float %0 to i32 + + ret i32 %1 +} + +define i64 @f2ull() #0 { +entry: +; CHECK: f2ull: + +; CHECK: call #__mspabi_fixfull + %0 = load volatile float, float* @g_float, align 8 + %1 = fptoui float %0 to i64 + + ret i64 %1 +} + +define double @l2d() #0 { +entry: +; CHECK: l2d: + +; CHECK: call #__mspabi_fltlid + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sitofp i32 %0 to double + + ret double %1 +} + +define double @ll2d() #0 { +entry: +; CHECK: ll2d: + +; CHECK: call #__mspabi_fltllid + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sitofp i64 %0 to double + + ret double %1 +} + +define double @ul2d() #0 { +entry: +; CHECK: ul2d: + +; CHECK: call #__mspabi_fltuld + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = uitofp i32 %0 to double + + ret double %1 +} + +define double @ull2d() #0 { +entry: +; CHECK: ull2d: + +; CHECK: call #__mspabi_fltulld + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = uitofp i64 %0 to double + + ret double %1 +} + +define float @l2f() #0 { +entry: +; CHECK: l2f: + +; CHECK: call #__mspabi_fltlif + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sitofp i32 %0 to float + + ret float %1 +} + +define float @ll2f() #0 { +entry: +; CHECK: ll2f: + +; CHECK: call #__mspabi_fltllif + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sitofp i64 %0 to float + + ret float %1 +} + +define float @ul2f() #0 { +entry: +; CHECK: ul2f: + +; CHECK: call #__mspabi_fltulf + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = uitofp i32 %0 to float + + ret float %1 +} + +define float @ull2f() #0 { +entry: +; CHECK: ull2f: + +; CHECK: call #__mspabi_fltullf + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = uitofp i64 %0 to float + + ret float %1 +} + +define i1 @cmpd_oeq() #0 { +entry: +; CHECK: cmpd_oeq: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp oeq double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_une() #0 { +entry: +; CHECK: cmpd_une: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp une double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_oge() #0 { +entry: +; CHECK: cmpd_oge: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp oge double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_olt() #0 { +entry: +; CHECK: cmpd_olt: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp olt double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_ole() #0 { +entry: +; CHECK: cmpd_ole: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp ole double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_ogt() #0 { +entry: +; CHECK: cmpd_ogt: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp ogt double %0, 
123.0 + + ret i1 %1 +} + +define i1 @cmpf_oeq() #0 { +entry: +; CHECK: cmpf_oeq: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp oeq float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_une() #0 { +entry: +; CHECK: cmpf_une: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp une float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_oge() #0 { +entry: +; CHECK: cmpf_oge: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp oge float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_olt() #0 { +entry: +; CHECK: cmpf_olt: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp olt float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_ole() #0 { +entry: +; CHECK: cmpf_ole: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp ole float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_ogt() #0 { +entry: +; CHECK: cmpf_ogt: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp ogt float %0, 123.0 + + ret i1 %1 +} + +define double @addd() #0 { +entry: +; CHECK: addd: + +; CHECK: call #__mspabi_addd + %0 = load volatile double, double* @g_double, align 8 + %1 = fadd double %0, 123.0 + + ret double %1 +} + +define float @addf() #0 { +entry: +; CHECK: addf: + +; CHECK: call #__mspabi_addf + %0 = load volatile float, float* @g_float, align 8 + %1 = fadd float %0, 123.0 + + ret float %1 +} + +define double @divd() #0 { +entry: +; CHECK: divd: + +; CHECK: call #__mspabi_divd + %0 = load volatile double, double* @g_double, align 8 + %1 = fdiv double %0, 123.0 + + ret double %1 +} + +define float @divf() #0 { +entry: +; CHECK: divf: + +; CHECK: call #__mspabi_divf + %0 = load volatile float, float* @g_float, align 8 + %1 = fdiv float %0, 123.0 + + ret float %1 +} + +define double @mpyd() #0 { +entry: +; CHECK: mpyd: + +; CHECK: call #__mspabi_mpyd + %0 = load volatile double, double* @g_double, align 8 + %1 = fmul double %0, 123.0 + + ret double %1 +} + +define float @mpyf() #0 { +entry: +; CHECK: mpyf: + +; CHECK: call #__mspabi_mpyf + %0 = load volatile float, float* @g_float, align 8 + %1 = fmul float %0, 123.0 + + ret float %1 +} + +define double @subd() #0 { +entry: +; CHECK: subd: + +; CHECK: call #__mspabi_subd + %0 = load volatile double, double* @g_double, align 8 + %1 = fsub double %0, %0 + + ret double %1 +} + +define float @subf() #0 { +entry: +; CHECK: subf: + +; CHECK: call #__mspabi_subf + %0 = load volatile float, float* @g_float, align 8 + %1 = fsub float %0, %0 + + ret float %1 +} + +define i16 @divi() #0 { +entry: +; CHECK: divi: + +; CHECK: call #__mspabi_divi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = sdiv i16 %0, %0 + + ret i16 %1 +} + +define i32 @divli() #0 { +entry: +; CHECK: divli: + +; CHECK: call #__mspabi_divli + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sdiv i32 %0, %0 + + ret i32 %1 +} + +define i64 @divlli() #0 { +entry: +; CHECK: divlli: + +; CHECK: call #__mspabi_divlli + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sdiv i64 %0, %0 + + ret i64 %1 +} + +define i16 @divu() #0 { +entry: +; CHECK: divu: + +; CHECK: call #__mspabi_divu + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = udiv i16 %0, %0 + + ret i16 %1 +} + +define i32 @divul() #0 { +entry: +; CHECK: divul: + +; CHECK: call #__mspabi_divul + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = udiv i32 %0, %0 + + ret i32 %1 +} + 
+define i64 @divull() #0 { +entry: +; CHECK: divull: + +; CHECK: call #__mspabi_divull + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = udiv i64 %0, %0 + + ret i64 %1 +} + +define i16 @remi() #0 { +entry: +; CHECK: remi: + +; CHECK: call #__mspabi_remi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = srem i16 %0, %0 + + ret i16 %1 +} + +define i32 @remli() #0 { +entry: +; CHECK: remli: + +; CHECK: call #__mspabi_remli + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = srem i32 %0, %0 + + ret i32 %1 +} + +define i64 @remlli() #0 { +entry: +; CHECK: remlli: + +; CHECK: call #__mspabi_remlli + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = srem i64 %0, %0 + + ret i64 %1 +} + +define i16 @remu() #0 { +entry: +; CHECK: remu: + +; CHECK: call #__mspabi_remu + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = urem i16 %0, %0 + + ret i16 %1 +} + +define i32 @remul() #0 { +entry: +; CHECK: remul: + +; CHECK: call #__mspabi_remul + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = urem i32 %0, %0 + + ret i32 %1 +} + +define i64 @remull() #0 { +entry: +; CHECK: remull: + +; CHECK: call #__mspabi_remull + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = urem i64 %0, %0 + + ret i64 %1 +} + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/promote-i8-mul.ll b/test/CodeGen/MSP430/promote-i8-mul.ll new file mode 100644 index 000000000000..0e05e3978b1e --- /dev/null +++ b/test/CodeGen/MSP430/promote-i8-mul.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8" +target triple = "msp430-elf" + +@g_29 = common global i8 0, align 1 ; [#uses=0] + +define signext i8 @foo(i8 signext %_si1, i8 signext %_si2) nounwind readnone { +entry: +; CHECK-LABEL: foo: +; CHECK: call #__mspabi_mpyi + %mul = mul i8 %_si2, %_si1 ; [#uses=1] + ret i8 %mul +} + +define void @uint81(i16* nocapture %p_32) nounwind { +entry: + %call = tail call i16 @bar(i8* bitcast (i8 (i8, i8)* @foo to i8*)) nounwind ; [#uses=0] + ret void +} + +declare i16 @bar(i8*) diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll index 076c44684579..6d747f09d8a7 100644 --- a/test/CodeGen/NVPTX/bug17709.ll +++ b/test/CodeGen/NVPTX/bug17709.ll @@ -1,26 +1,26 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -; ModuleID = '__kernelgen_main_module' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) { -entry: - ;unreachable - %t0 = insertvalue {double, double} undef, double 1.0, 0 - %t1 = insertvalue {double, double} %t0, double 1.0, 1 - ret { double, double } %t1 -} - -%struct.descriptor_dimension.0.52 = type { i64, i64, i64 } -%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } 
-%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } -@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096 - -; CHECK: .visible .entry __kernelgen_main -define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) { -entry: - %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8) - ret void -} - +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + +; ModuleID = '__kernelgen_main_module' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) { +entry: + ;unreachable + %t0 = insertvalue {double, double} undef, double 1.0, 0 + %t1 = insertvalue {double, double} %t0, double 1.0, 1 + ret { double, double } %t1 +} + +%struct.descriptor_dimension.0.52 = type { i64, i64, i64 } +%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } +%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } +@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096 + +; CHECK: .visible .entry __kernelgen_main +define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) { +entry: + %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8) + ret void +} + diff --git a/test/CodeGen/NVPTX/ctlz.ll b/test/CodeGen/NVPTX/ctlz.ll index 005958bd938a..7aa29fe811dd 100644 --- a/test/CodeGen/NVPTX/ctlz.ll +++ b/test/CodeGen/NVPTX/ctlz.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/test/CodeGen/NVPTX/ctpop.ll b/test/CodeGen/NVPTX/ctpop.ll index b961d4d27bdd..69a4f879a8d8 100644 --- a/test/CodeGen/NVPTX/ctpop.ll +++ b/test/CodeGen/NVPTX/ctpop.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/test/CodeGen/NVPTX/cttz.ll b/test/CodeGen/NVPTX/cttz.ll index 124ba9d1e9a7..0bfe0139bcdf 100644 --- a/test/CodeGen/NVPTX/cttz.ll +++ b/test/CodeGen/NVPTX/cttz.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git 
a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll index 3d4140820794..08a2ee14e8bd 100644 --- a/test/CodeGen/NVPTX/f16-instructions.ll +++ b/test/CodeGen/NVPTX/f16-instructions.ll @@ -1,1078 +1,1079 @@ -; ## Full FP16 support enabled by default. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ -; RUN: -O0 -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s -; ## FP16 support explicitly disabled. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ -; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s -; ## FP16 is not supported by hardware. -; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ -; RUN: -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s - -target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: test_ret_const( -; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_ret_const() #0 { - ret half 1.0 -} - -; CHECK-LABEL: test_fadd( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1]; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd(half %a, half %b) #0 { - %r = fadd half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fadd_v1f16( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1]; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 { - %r = fadd <1 x half> %a, %b - ret <1 x half> %r -} - -; Check that we can lower fadd with immediate arguments. 
-; CHECK-LABEL: test_fadd_imm_0( -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0]; -; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd_imm_0(half %b) #0 { - %r = fadd half 1.0, %b - ret half %r -} - -; CHECK-LABEL: test_fadd_imm_1( -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0]; -; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd_imm_1(half %a) #0 { - %r = fadd half %a, 1.0 - ret half %r -} - -; CHECK-LABEL: test_fsub( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1]; -; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fsub(half %a, half %b) #0 { - %r = fsub half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fneg( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0]; -; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000 -; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; -; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fneg(half %a) #0 { - %r = fsub half 0.0, %a - ret half %r -} - -; CHECK-LABEL: test_fmul( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1]; -; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fmul(half %a, half %b) #0 { - %r = fmul half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fdiv( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1]; -; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]]; -; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]]; -; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fdiv(half %a, half %b) #0 { - %r = fdiv half %a, %b - ret half %r -} - -; CHECK-LABEL: test_frem( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], 
[test_frem_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1]; -; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]]; -; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]]; -; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]]; -; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]]; -; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]]; -; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_frem(half %a, half %b) #0 { - %r = frem half %a, %b - ret half %r -} - -; CHECK-LABEL: test_store( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0]; -; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1]; -; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]]; -; CHECK-NEXT: ret; -define void @test_store(half %a, half* %b) #0 { - store half %a, half* %b - ret void -} - -; CHECK-LABEL: test_load( -; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0]; -; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_load(half* %a) #0 { - %r = load half, half* %a - ret half %r -} - -; CHECK-LABEL: .visible .func test_halfp0a1( -; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0]; -; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1]; -; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] -; CHECK: ret -define void @test_halfp0a1(half * noalias readonly %from, half * %to) { - %1 = load half, half * %from , align 1 - store half %1, half * %to , align 1 - ret void -} - -declare half @test_callee(half %a, half %b) #0 - -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: st.param.b16 [param0+0], [[A]]; -; CHECK-DAG: st.param.b16 [param1+0], [[B]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_call(half %a, half %b) #0 { - %r = call half @test_callee(half %a, half %b) - ret half %r -} - -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: st.param.b16 [param0+0], [[B]]; -; CHECK-DAG: st.param.b16 [param1+0], [[A]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_call_flipped(half %a, half %b) #0 { - %r = call half @test_callee(half %b, half %a) - ret half %r -} - -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: 
st.param.b16 [param0+0], [[B]]; -; CHECK-DAG: st.param.b16 [param1+0], [[A]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_tailcall_flipped(half %a, half %b) #0 { - %r = tail call half @test_callee(half %b, half %a) - ret half %r -} - -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1]; -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select(half %a, half %b, i1 zeroext %c) #0 { - %r = select i1 %c, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3]; -; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] -; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { - %cc = fcmp une half %c, %d - %r = select i1 %cc, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2]; -; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3]; -; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] -; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { - %cc = fcmp une half %c, %d - %r = select i1 %cc, float %a, float %b - ret float %r -} - -; CHECK-LABEL: test_select_cc_f16_f32( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1]; -; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 { - %cc = fcmp une float %c, %d - %r = select i1 %cc, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_fcmp_une( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1]; -; CHECK-F16: 
setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_une(half %a, half %b) #0 { - %r = fcmp une half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ueq( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1]; -; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ueq(half %a, half %b) #0 { - %r = fcmp ueq half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ugt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1]; -; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ugt(half %a, half %b) #0 { - %r = fcmp ugt half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_uge( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1]; -; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_uge(half %a, half %b) #0 { - %r = fcmp uge half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ult( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1]; -; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ult(half %a, half %b) #0 { - %r = fcmp ult half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ule( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1]; -; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ule(half %a, half %b) #0 { - %r = fcmp ule half %a, %b - ret i1 
%r -} - - -; CHECK-LABEL: test_fcmp_uno( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1]; -; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_uno(half %a, half %b) #0 { - %r = fcmp uno half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_one( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1]; -; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_one(half %a, half %b) #0 { - %r = fcmp one half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_oeq( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1]; -; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_oeq(half %a, half %b) #0 { - %r = fcmp oeq half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ogt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1]; -; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ogt(half %a, half %b) #0 { - %r = fcmp ogt half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_oge( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1]; -; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_oge(half %a, half %b) #0 { - %r = fcmp oge half %a, %b - ret i1 %r -} - -; XCHECK-LABEL: test_fcmp_olt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1]; -; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 
[[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_olt(half %a, half %b) #0 {
- %r = fcmp olt half %a, %b
- ret i1 %r
-}
-
-; XCHECK-LABEL: test_fcmp_ole(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
-; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ole(half %a, half %b) #0 {
- %r = fcmp ole half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ord(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
-; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ord(half %a, half %b) #0 {
- %r = fcmp ord half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_br_cc(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1];
-; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
-; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
-; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
-; CHECK: st.u32 [%[[C]]],
-; CHECK: [[LABEL]]:
-; CHECK: st.u32 [%[[D]]],
-; CHECK: ret;
-define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
- %c = fcmp uge half %a, %b
- br i1 %c, label %then, label %else
-then:
- store i32 0, i32* %p1
- ret void
-else:
- store i32 0, i32* %p2
- ret void
-}
-
-; CHECK-LABEL: test_phi(
-; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
-; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]];
-; CHECK: [[LOOP:LBB[0-9_]+]]:
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
-; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]];
-; CHECK: {
-; CHECK: st.param.b64 [param0+0], %[[P1]];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_dummy
-; CHECK: }
-; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: @[[PRED]] bra [[LOOP]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_phi(half* %p1) #0 {
-entry:
- %a = load half, half* %p1
- br label %loop
-loop:
- %r = phi half [%a, %entry], [%b, %loop]
- %b = load half, half* %p1
- %c = call i1 @test_dummy(half* %p1)
- br i1 %c, label %loop, label %return
-return:
- ret half %r
-}
-declare i1 @test_dummy(half* %p1) #0
-
-; CHECK-LABEL: test_fptosi_i32(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
-; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i32 @test_fptosi_i32(half %a) #0 {
- %r = fptosi half %a to i32
- ret i32 %r
-}
-
-; CHECK-LABEL: test_fptosi_i64(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
-; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i64 @test_fptosi_i64(half %a) #0 {
- %r = fptosi half %a to i64
- ret i64 %r
-}
-
-; CHECK-LABEL: test_fptoui_i32(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
-; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i32 @test_fptoui_i32(half %a) #0 {
- %r = fptoui half %a to i32
- ret i32 %r
-}
-
-; CHECK-LABEL: test_fptoui_i64(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
-; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i64 @test_fptoui_i64(half %a) #0 {
- %r = fptoui half %a to i64
- ret i64 %r
-}
-
-; CHECK-LABEL: test_uitofp_i32(
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
-; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i32(i32 %a) #0 {
- %r = uitofp i32 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_uitofp_i64(
-; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
-; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i64(i64 %a) #0 {
- %r = uitofp i64 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i32(
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
-; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i32(i32 %a) #0 {
- %r = sitofp i32 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i64(
-; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
-; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i64(i64 %a) #0 {
- %r = sitofp i64 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_uitofp_i32_fadd(
-; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
-; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
-; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
- %c = uitofp i32 %a to half
- %r = fadd half %b, %c
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i32_fadd(
-; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
-; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
-; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
-; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
- %c = sitofp i32 %a to half
- %r = fadd half %b, %c
- ret half %r
-}
-
-; CHECK-LABEL: test_fptrunc_float(
-; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fptrunc_float(float %a) #0 {
- %r = fptrunc float %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_fptrunc_double(
-; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
-; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fptrunc_double(double %a) #0 {
- %r = fptrunc double %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_fpext_float(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0];
-; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
-; CHECK: st.param.f32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define float @test_fpext_float(half %a) #0 {
- %r = fpext half %a to float
- ret float %r
-}
-
-; CHECK-LABEL: test_fpext_double(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0];
-; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
-; CHECK: st.param.f64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define double @test_fpext_double(half %a) #0 {
- %r = fpext half %a to double
- ret double %r
-}
-
-
-; CHECK-LABEL: test_bitcast_halftoi16(
-; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
-; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]]
-; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]]
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i16 @test_bitcast_halftoi16(half %a) #0 {
- %r = bitcast half %a to i16
- ret i16 %r
-}
-
-; CHECK-LABEL: test_bitcast_i16tohalf(
-; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
-; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]]
-; CHECK: st.param.b16 [func_retval0+0], [[AH]];
-; CHECK: ret;
-define half @test_bitcast_i16tohalf(i16 %a) #0 {
- %r = bitcast i16 %a to half
- ret half %r
-}
-
-
-declare half @llvm.sqrt.f16(half %a) #0
-declare half @llvm.powi.f16(half %a, i32 %b) #0
-declare half @llvm.sin.f16(half %a) #0
-declare half @llvm.cos.f16(half %a) #0
-declare half @llvm.pow.f16(half %a, half %b) #0
-declare half @llvm.exp.f16(half %a) #0
-declare half @llvm.exp2.f16(half %a) #0
-declare half @llvm.log.f16(half %a) #0
-declare half @llvm.log10.f16(half %a) #0
-declare half @llvm.log2.f16(half %a) #0
-declare half @llvm.fma.f16(half %a, half %b, half %c) #0
-declare half @llvm.fabs.f16(half %a) #0
-declare half @llvm.minnum.f16(half %a, half %b) #0
-declare half @llvm.maxnum.f16(half %a, half %b) #0
-declare half @llvm.copysign.f16(half %a, half %b) #0
-declare half @llvm.floor.f16(half %a) #0
-declare half @llvm.ceil.f16(half %a) #0
-declare half @llvm.trunc.f16(half %a) #0
-declare half @llvm.rint.f16(half %a) #0
-declare half @llvm.nearbyint.f16(half %a) #0
-declare half @llvm.round.f16(half %a) #0
-declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
-
-; CHECK-LABEL: test_sqrt(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sqrt(half %a) #0 {
- %r = call half @llvm.sqrt.f16(half %a)
- ret half %r
-}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_powi(
-;define half @test_powi(half %a, i32 %b) #0 {
-; %r = call half @llvm.powi.f16(half %a, i32 %b)
-; ret half %r
-;}
-
-; CHECK-LABEL: test_sin(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sin(half %a) #0 #1 {
- %r = call half @llvm.sin.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_cos(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_cos(half %a) #0 #1 {
- %r = call half @llvm.cos.f16(half %a)
- ret half %r
-}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_pow(
-;define half @test_pow(half %a, half %b) #0 {
-; %r = call half @llvm.pow.f16(half %a, half %b)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp(
-;define half @test_exp(half %a) #0 {
-; %r = call half @llvm.exp.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp2(
-;define half @test_exp2(half %a) #0 {
-; %r = call half @llvm.exp2.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log(
-;define half @test_log(half %a) #0 {
-; %r = call half @llvm.log.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log10(
-;define half @test_log10(half %a) #0 {
-; %r = call half @llvm.log10.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log2(
-;define half @test_log2(half %a) #0 {
-; %r = call half @llvm.log2.f16(half %a)
-; ret half %r
-;}
-
-; CHECK-LABEL: test_fma(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2];
-; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret
-define half @test_fma(half %a, half %b, half %c) #0 {
- %r = call half @llvm.fma.f16(half %a, half %b, half %c)
- ret half %r
-}
-
-; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fabs(half %a) #0 {
- %r = call half @llvm.fabs.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_minnum(half %a, half %b) #0 {
- %r = call half @llvm.minnum.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_maxnum(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_maxnum(half %a, half %b) #0 {
- %r = call half @llvm.maxnum.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1];
-; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign(half %a, half %b) #0 {
- %r = call half @llvm.copysign.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_f32(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
-; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
-; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
-; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
-; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign_f32(half %a, float %b) #0 {
- %tb = fptrunc float %b to half
- %r = call half @llvm.copysign.f16(half %a, half %tb)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_f64(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
-; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
-; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
-; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
-; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign_f64(half %a, double %b) #0 {
- %tb = fptrunc double %b to half
- %r = call half @llvm.copysign.f16(half %a, half %tb)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_extended(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
-; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
-; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]];
-; CHECK: st.param.f32 [func_retval0+0], [[XR]];
-; CHECK: ret;
-define float @test_copysign_extended(half %a, half %b) #0 {
- %r = call half @llvm.copysign.f16(half %a, half %b)
- %xr = fpext half %r to float
- ret float %xr
-}
-
-; CHECK-LABEL: test_floor(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0];
-; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_floor(half %a) #0 {
- %r = call half @llvm.floor.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_ceil(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0];
-; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_ceil(half %a) #0 {
- %r = call half @llvm.ceil.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_trunc(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0];
-; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_trunc(half %a) #0 {
- %r = call half @llvm.trunc.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_rint(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_rint(half %a) #0 {
- %r = call half @llvm.rint.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_nearbyint(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_nearbyint(half %a) #0 {
- %r = call half @llvm.nearbyint.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_round(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_round(half %a) #0 {
- %r = call half @llvm.round.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2];
-; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fmuladd(half %a, half %b, half %c) #0 {
- %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
- ret half %r
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00;
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_ret_const() #0 {
+ ret half 1.0
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd(half %a, half %b) #0 {
+ %r = fadd half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fadd_v1f16(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
+ %r = fadd <1 x half> %a, %b
+ ret <1 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_0(half %b) #0 {
+ %r = fadd half 1.0, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_1(half %a) #0 {
+ %r = fadd half %a, 1.0
+ ret half %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fsub(half %a, half %b) #0 {
+ %r = fsub half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0];
+; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fneg(half %a) #0 {
+ %r = fsub half 0.0, %a
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fmul(half %a, half %b) #0 {
+ %r = fmul half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fdiv(half %a, half %b) #0 {
+ %r = fdiv half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_frem(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1];
+; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]];
+; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]];
+; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]];
+; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_frem(half %a, half %b) #0 {
+ %r = frem half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_store(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0];
+; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1];
+; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]];
+; CHECK-NEXT: ret;
+define void @test_store(half %a, half* %b) #0 {
+ store half %a, half* %b
+ ret void
+}
+
+; CHECK-LABEL: test_load(
+; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0];
+; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_load(half* %a) #0 {
+ %r = load half, half* %a
+ ret half %r
+}
+
+; CHECK-LABEL: .visible .func test_halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
+; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8 [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]]
+; CHECK: ret
+define void @test_halfp0a1(half * noalias readonly %from, half * %to) {
+ %1 = load half, half * %from , align 1
+ store half %1, half * %to , align 1
+ ret void
+}
+
+declare half @test_callee(half %a, half %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[A]];
+; CHECK-DAG: st.param.b16 [param1+0], [[B]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call(half %a, half %b) #0 {
+ %r = call half @test_callee(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call_flipped(half %a, half %b) #0 {
+ %r = call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_tailcall_flipped(half %a, half %b) #0 {
+ %r = tail call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
+; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, float %a, float %b
+ ret float %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
+; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
+; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
+ %cc = fcmp une float %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_une(half %a, half %b) #0 {
+ %r = fcmp une half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ueq(half %a, half %b) #0 {
+ %r = fcmp ueq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ugt(half %a, half %b) #0 {
+ %r = fcmp ugt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uge(half %a, half %b) #0 {
+ %r = fcmp uge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ult(half %a, half %b) #0 {
+ %r = fcmp ult half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ule(half %a, half %b) #0 {
+ %r = fcmp ule half %a, %b
+ ret i1 %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uno(half %a, half %b) #0 {
+ %r = fcmp uno half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_one(half %a, half %b) #0 {
+ %r = fcmp one half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oeq(half %a, half %b) #0 {
+ %r = fcmp oeq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ogt(half %a, half %b) #0 {
+ %r = fcmp ogt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oge(half %a, half %b) #0 {
+ %r = fcmp oge half %a, %b
+ ret i1 %r
+}
+
+; XCHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_olt(half %a, half %b) #0 {
+ %r = fcmp olt half %a, %b
+ ret i1 %r
+}
+
+; XCHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ole(half %a, half %b) #0 {
+ %r = fcmp ole half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ord(half %a, half %b) #0 {
+ %r = fcmp ord half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_br_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1];
+; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
+; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
+; CHECK: st.u32 [%[[C]]],
+; CHECK: [[LABEL]]:
+; CHECK: st.u32 [%[[D]]],
+; CHECK: ret;
+define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
+ %c = fcmp uge half %a, %b
+ br i1 %c, label %then, label %else
+then:
+ store i32 0, i32* %p1
+ ret void
+else:
+ store i32 0, i32* %p2
+ ret void
+}
+
+; CHECK-LABEL: test_phi(
+; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
+; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]];
+; CHECK: [[LOOP:LBB[0-9_]+]]:
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
+; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]];
+; CHECK: {
+; CHECK: st.param.b64 [param0+0], %[[P1]];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_dummy
+; CHECK: }
+; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
+; CHECK: @[[PRED]] bra [[LOOP]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_phi(half* %p1) #0 {
+entry:
+ %a = load half, half* %p1
+ br label %loop
+loop:
+ %r = phi half [%a, %entry], [%b, %loop]
+ %b = load half, half* %p1
+ %c = call i1 @test_dummy(half* %p1)
+ br i1 %c, label %loop, label %return
+return:
+ ret half %r
+}
+declare i1 @test_dummy(half* %p1) #0
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptosi_i32(half %a) #0 {
+ %r = fptosi half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptosi_i64(half %a) #0 {
+ %r = fptosi half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_fptoui_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
+; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptoui_i32(half %a) #0 {
+ %r = fptoui half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptoui_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
+; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptoui_i64(half %a) #0 {
+ %r = fptoui half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_uitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
+; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32(i32 %a) #0 {
+ %r = uitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
+; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i64(i64 %a) #0 {
+ %r = uitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
+; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32(i32 %a) #0 {
+ %r = sitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
+; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i64(i64 %a) #0 {
+ %r = sitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = uitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = sitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_float(
+; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_float(float %a) #0 {
+ %r = fptrunc float %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_double(
+; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
+; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_double(double %a) #0 {
+ %r = fptrunc double %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fpext_float(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0];
+; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define float @test_fpext_float(half %a) #0 {
+ %r = fpext half %a to float
+ ret float %r
+}
+
+; CHECK-LABEL: test_fpext_double(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0];
+; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
+; CHECK: st.param.f64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define double @test_fpext_double(half %a) #0 {
+ %r = fpext half %a to double
+ ret double %r
+}
+
+
+; CHECK-LABEL: test_bitcast_halftoi16(
+; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
+; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]]
+; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]]
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i16 @test_bitcast_halftoi16(half %a) #0 {
+ %r = bitcast half %a to i16
+ ret i16 %r
+}
+
+; CHECK-LABEL: test_bitcast_i16tohalf(
+; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
+; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]]
+; CHECK: st.param.b16 [func_retval0+0], [[AH]];
+; CHECK: ret;
+define half @test_bitcast_i16tohalf(i16 %a) #0 {
+ %r = bitcast i16 %a to half
+ ret half %r
+}
+
+
+declare half @llvm.sqrt.f16(half %a) #0
+declare half @llvm.powi.f16(half %a, i32 %b) #0
+declare half @llvm.sin.f16(half %a) #0
+declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.pow.f16(half %a, half %b) #0
+declare half @llvm.exp.f16(half %a) #0
+declare half @llvm.exp2.f16(half %a) #0
+declare half @llvm.log.f16(half %a) #0
+declare half @llvm.log10.f16(half %a) #0
+declare half @llvm.log2.f16(half %a) #0
+declare half @llvm.fma.f16(half %a, half %b, half %c) #0
+declare half @llvm.fabs.f16(half %a) #0
+declare half @llvm.minnum.f16(half %a, half %b) #0
+declare half @llvm.maxnum.f16(half %a, half %b) #0
+declare half @llvm.copysign.f16(half %a, half %b) #0
+declare half @llvm.floor.f16(half %a) #0
+declare half @llvm.ceil.f16(half %a) #0
+declare half @llvm.trunc.f16(half %a) #0
+declare half @llvm.rint.f16(half %a) #0
+declare half @llvm.nearbyint.f16(half %a) #0
+declare half @llvm.round.f16(half %a) #0
+declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sqrt(half %a) #0 {
+ %r = call half @llvm.sqrt.f16(half %a)
+ ret half %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define half @test_powi(half %a, i32 %b) #0 {
+; %r = call half @llvm.powi.f16(half %a, i32 %b)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sin(half %a) #0 #1 {
+ %r = call half @llvm.sin.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_cos(half %a) #0 #1 {
+ %r = call half @llvm.cos.f16(half %a)
+ ret half %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define half @test_pow(half %a, half %b) #0 {
+; %r = call half @llvm.pow.f16(half %a, half %b)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define half @test_exp(half %a) #0 {
+; %r = call half @llvm.exp.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define half @test_exp2(half %a) #0 {
+; %r = call half @llvm.exp2.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define half @test_log(half %a) #0 {
+; %r = call half @llvm.log.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define half @test_log10(half %a) #0 {
+; %r = call half @llvm.log10.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define half @test_log2(half %a) #0 {
+; %r = call half @llvm.log2.f16(half %a)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret
+define half @test_fma(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fma.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fabs(half %a) #0 {
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_minnum(half %a, half %b) #0 {
+ %r = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_maxnum(half %a, half %b) #0 {
+ %r = call half @llvm.maxnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
+; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f32(half %a, float %b) #0 {
+ %tb = fptrunc float %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
+; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f64(half %a, double %b) #0 {
+ %tb = fptrunc double %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]];
+; CHECK: st.param.f32 [func_retval0+0], [[XR]];
+; CHECK: ret;
+define float @test_copysign_extended(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ %xr = fpext half %r to float
+ ret float %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0];
+; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_floor(half %a) #0 {
+ %r = call half @llvm.floor.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0];
+; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_ceil(half %a) #0 {
+ %r = call half @llvm.ceil.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0];
+; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_trunc(half %a) #0 {
+ %r = call half @llvm.trunc.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_rint(half %a) #0 {
+ %r = call half @llvm.rint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_nearbyint(half %a) #0 {
+ %r = call half @llvm.nearbyint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_round(half %a) #0 {
+ %r = call half @llvm.round.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fmuladd(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll
index 33bb616d895c..5dc796ada37f 100644
--- a/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1,1426 +1,1427 @@
-; ## Full FP16 support enabled by default.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
-; ## FP16 support explicitly disabled.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
-; ## FP16 is not supported by hardware.
-; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ -; RUN: -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s - -target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: test_ret_const( -; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184; -; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_ret_const() #0 { - ret <2 x half> -} - -; CHECK-LABEL: test_extract_0( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0]; -; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_0(<2 x half> %a) #0 { - %e = extractelement <2 x half> %a, i32 0 - ret half %e -} - -; CHECK-LABEL: test_extract_1( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0]; -; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_1(<2 x half> %a) #0 { - %e = extractelement <2 x half> %a, i32 1 - ret half %e -} - -; CHECK-LABEL: test_extract_i( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0]; -; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; -; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]]; -; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { - %e = extractelement <2 x half> %a, i64 %idx - ret half %e -} - -; CHECK-LABEL: test_fadd( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1]; -; -; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { - %r = fadd <2 x half> %a, %b - ret <2 x half> %r -} - -; Check that we can lower fadd with immediate arguments. 
-; Check that we can lower fadd with immediate arguments.
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
-;
-; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]];
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
-  %r = fadd <2 x half> <half 1.0, half 2.0>, %a
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
-;
-; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]];
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
-  %r = fadd <2 x half> %a, <half 1.0, half 2.0>
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fsub(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
-;
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
-; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
-  %r = fsub <2 x half> %a, %b
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fneg(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
-;
-; CHECK-F16: mov.u32 [[I0:%r[0-9+]]], 0;
-; CHECK-F16: mov.b32 [[IHH0:%hh[0-9+]]], [[I0]];
-; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fneg(<2 x half> %a) #0 {
-  %r = fsub <2 x half> <half 0.0, half 0.0>, %a
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fmul(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
-; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
-  %r = fmul <2 x half> %a, %b
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fdiv(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
-; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
-; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
-  %r = fdiv <2 x half> %a, %b
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_frem(
-; -- Load two 16x2 inputs and split them into f16 elements
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
-; -- Split into elements
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; -- promote to f32.
-; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
-; -- frem(a[0],b[0]).
-; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
-; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
-; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
-; -- frem(a[1],b[1]).
-; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]]; -; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]]; -; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]]; -; -- convert back to f16. -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; -- merge into f16x2 and return it. -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { - %r = frem <2 x half> %a, %b - ret <2 x half> %r -} - -; CHECK-LABEL: .func test_ldst_v2f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1]; -; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]] -; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]]; -; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]}; -; CHECK: ret; -define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) { - %t1 = load <2 x half>, <2 x half>* %a - store <2 x half> %t1, <2 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v3f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1]; -; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair -; number of bitshifting instructions that may change at llvm's whim. -; So we only verify that we only issue correct number of writes using -; correct offset, but not the values we write. -; CHECK-DAG: ld.u64 -; CHECK-DAG: st.u32 [%[[B]]], -; CHECK-DAG: st.b16 [%[[B]]+4], -; CHECK: ret; -define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) { - %t1 = load <3 x half>, <3 x half>* %a - store <3 x half> %t1, <3 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v4f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; -; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; -define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) { - %t1 = load <4 x half>, <4 x half>* %a - store <4 x half> %t1, <4 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v8f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; -; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; -define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) { - %t1 = load <8 x half>, <8 x half>* %a - store <8 x half> %t1, <8 x half>* %b, align 16 - ret void -} - -declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 - -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[A]]; -; CHECK-DAG: st.param.b32 [param1+0], [[B]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; 
-; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { - %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1]; -; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { - %r = select i1 %c, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; -; 
CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { - %cc = fcmp une <2 x half> %c, %d - %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, - <2 x half> %c, <2 x half> %d) #0 { - %cc = fcmp une <2 x half> %c, %d - %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b - ret <2 x float> %r -} - -; CHECK-LABEL: test_select_cc_f16_f32( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1]; -; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, - <2 x float> %c, <2 x float> %d) #0 { - %cc = fcmp une <2 x float> %c, %d - %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_fcmp_une( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1]; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp une <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ueq( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1]; -; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ueq <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ugt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1]; -; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ugt <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_uge( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1]; -; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; 
CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp uge <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ult( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1]; -; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ult <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ule( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1]; -; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ule <2 x half> %a, %b - ret <2 x i1> %r -} - - -; CHECK-LABEL: test_fcmp_uno( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1]; -; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp uno <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_one( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1]; -; CHECK-F16: setp.ne.f16x2 
[[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp one <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_oeq( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1]; -; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp oeq <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ogt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1]; -; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ogt <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_oge( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1]; -; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; 
CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp oge <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_olt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1]; -; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp olt <2 x half> %a, %b - ret <2 x i1> %r -} - -; XCHECK-LABEL: test_fcmp_ole( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1]; -; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ole <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ord( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1]; -; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], 
[[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ord <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fptosi_i32( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { - %r = fptosi <2 x half> %a to <2 x i32> - ret <2 x i32> %r -} - -; CHECK-LABEL: test_fptosi_i64( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { - %r = fptosi <2 x half> %a to <2 x i64> - ret <2 x i64> %r -} - -; CHECK-LABEL: test_fptoui_2xi32( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { - %r = fptoui <2 x half> %a to <2 x i32> - ret <2 x i32> %r -} - -; CHECK-LABEL: test_fptoui_2xi64( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { - %r = fptoui <2 x half> %a to <2 x i64> - ret <2 x i64> %r -} - -; CHECK-LABEL: test_uitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { - %r = uitofp <2 x i32> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_uitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { - %r = uitofp <2 x i64> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_sitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { - %r = sitofp <2 x i32> %a to <2 x half> - ret <2 x half> %r 
-} - -; CHECK-LABEL: test_sitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { - %r = sitofp <2 x i64> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_uitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]]; - -; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { - %c = uitofp <2 x i32> %a to <2 x half> - %r = fadd <2 x half> %b, %c - ret <2 x half> %r -} - -; CHECK-LABEL: test_sitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]]; -; -; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { - %c = sitofp <2 x i32> %a to <2 x half> - %r = fadd <2 x half> %b, %c - ret <2 x half> %r -} - -; CHECK-LABEL: test_fptrunc_2xfloat( -; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define 
<2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { - %r = fptrunc <2 x float> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_fptrunc_2xdouble( -; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; -; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { - %r = fptrunc <2 x double> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_fpext_2xfloat( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK: ret; -define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { - %r = fpext <2 x half> %a to <2 x float> - ret <2 x float> %r -} - -; CHECK-LABEL: test_fpext_2xdouble( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK: ret; -define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { - %r = fpext <2 x half> %a to <2 x double> - ret <2 x double> %r -} - - -; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; -; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] -; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 -; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] -; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { - %r = bitcast <2 x half> %a to <2 x i16> - ret <2 x i16> %r -} - -; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( -; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; -; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; -; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; -; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; -; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { - %r = bitcast <2 x i16> %a to <2 x half> - ret <2 x half> %r -} - - -declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0 -declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 -declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.copysign.f16(<2 x 
half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 - -; CHECK-LABEL: test_sqrt( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sqrt(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) - ret <2 x half> %r -} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_powi( -;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 { -; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) -; ret <2 x half> %r -;} - -; CHECK-LABEL: test_sin( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sin(<2 x half> %a) #0 #1 { - %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_cos( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_cos(<2 x half> %a) #0 #1 { - %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) - ret <2 x half> %r -} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_pow( -;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 { -; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp( -;define <2 x half> @test_exp(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp2( -;define <2 x half> @test_exp2(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. 
-; XCHECK-LABEL: test_log(
-;define <2 x half> @test_log(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log10(
-;define <2 x half> @test_log10(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log2(
-;define <2 x half> @test_log2(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-; CHECK-LABEL: test_fma(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
-; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
-;
-; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret
-define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
-  %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fabs(<2 x half> %a) #0 {
-  %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
-; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
-; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half>
@test_minnum(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_maxnum( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; -; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_f32( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; -; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; -; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16; -; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; 
CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { - %tb = fptrunc <2 x float> %b to <2 x half> - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_f64( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0]; -; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; -; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; -; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; -; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; -; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { - %tb = fptrunc <2 x double> %b to <2 x half> - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_extended( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]] -; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]]; -; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]]; -; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; -; CHECK: ret; -define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) - %xr = fpext <2 x half> %r to <2 x float> - ret <2 x float> %xr -} - -; CHECK-LABEL: test_floor( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], 
{[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_floor(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_ceil( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_ceil(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_trunc( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_trunc(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_rint( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_rint(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_nearbyint( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_nearbyint(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_round( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_round(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.round.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_fmuladd( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2]; -; -; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
- %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
- ret <2 x half> %r
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.u32 [[T:%r[0-9]+]], 1073757184;
+; -- 1073757184 is 0x40003C00: half 1.0 (0x3C00) in lane 0 and half 2.0
+; (0x4000) in lane 1, packed into a single b32 register.
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[T]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_ret_const() #0 {
+ ret <2 x half> <half 1.0, half 2.0>
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0];
+; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_0(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 0
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0];
+; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_1(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 1
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_i(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0];
+; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
+; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
+ %e = extractelement <2 x half> %a, i64 %idx
+ ret half %e
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1];
+;
+; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], 
[[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fadd <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
+ %r = fadd <2 x half> <half 1.0, half 2.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
+ %r = fadd <2 x half> %a, <half 1.0, half 2.0>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
+;
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fsub(<2 
x half> %a, <2 x half> %b) #0 {
+ %r = fsub <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
+;
+; CHECK-F16: mov.u32 [[I0:%r[0-9]+]], 0;
+; CHECK-F16: mov.b32 [[IHH0:%hh[0-9]+]], [[I0]];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fneg(<2 x half> %a) #0 {
+ %r = fsub <2 x half> <half 0.0, half 0.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fmul <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fdiv <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_frem(
+; -- Load the two f16x2 inputs.
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
+; -- Split into elements
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; -- promote to f32.
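+; -- (frem has no direct PTX equivalent; each lane is expanded as
+; a - floor(a/b)*b, i.e. the div.rn/cvt.rmi/mul/sub sequence checked below.)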
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; -- frem(a[0],b[0]).
+; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
+; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
+; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
+; -- frem(a[1],b[1]).
+; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]];
+; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
+; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]];
+; -- convert back to f16.
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; -- merge into f16x2 and return it.
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
+ %r = frem <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: .func test_ldst_v2f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
+; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]]
+; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
+; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]};
+; CHECK: ret;
+define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) {
+ %t1 = load <2 x half>, <2 x half>* %a
+ store <2 x half> %t1, <2 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v3f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
+; -- v3 is inconvenient to capture, as it's lowered as ld.b64 plus a fair
+; number of bit-shifting instructions that may change at LLVM's whim.
+; So we only verify that we issue the correct number of stores at the
+; correct offsets, but not the values we store.
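+; -- (A <3 x half> occupies 6 bytes, so the store side is one st.u32 for
+; lanes 0-1 plus one st.b16 at offset +4 for lane 2.)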
+; CHECK-DAG: ld.u64 +; CHECK-DAG: st.u32 [%[[B]]], +; CHECK-DAG: st.b16 [%[[B]]+4], +; CHECK: ret; +define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) { + %t1 = load <3 x half>, <3 x half>* %a + store <3 x half> %t1, <3 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v4f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; +; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) { + %t1 = load <4 x half>, <4 x half>* %a + store <4 x half> %t1, <4 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v8f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; +; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) { + %t1 = load <8 x half>, <8 x half>* %a + store <8 x half> %t1, <8 x half>* %b, align 16 + ret void +} + +declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 + +; CHECK-LABEL: test_call( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[A]]; +; CHECK-DAG: st.param.b32 [param1+0], [[B]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_call_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_tailcall_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; 
CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_select( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1]; +; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { + %r = select i1 %c, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3]; +; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc_f32_f16( +; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3]; +; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x 
float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, + <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +; CHECK-LABEL: test_select_cc_f16_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1]; +; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; +; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; +; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, + <2 x float> %c, <2 x float> %d) #0 { + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fcmp_une( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1]; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp une <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ueq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1]; +; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ueq <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ugt( +; CHECK-DAG: ld.param.b32 
[[A:%hh[0-9]+]], [test_fcmp_ugt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1]; +; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ugt <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_uge( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1]; +; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uge <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ult( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1]; +; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ult <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ule( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1]; +; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: 
cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ule <2 x half> %a, %b + ret <2 x i1> %r +} + + +; CHECK-LABEL: test_fcmp_uno( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1]; +; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uno <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_one( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1]; +; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp one <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_oeq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1]; +; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; 
CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oeq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ogt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp olt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ole(
+; 
CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1]; +; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ole <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ord( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1]; +; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ord <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fptosi_i32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptosi_i64( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: 
ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { + %r = uitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { + %r = uitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { + %r = sitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { + %r = sitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; +; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]]; + +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; 
CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = uitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; +; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]]; +; +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = sitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xfloat( +; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { + %r = fptrunc <2 x float> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; +; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { + %r = fptrunc <2 x double> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fpext_2xfloat( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x float> + ret <2 x float> %r +} + +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x double> + ret <2 x double> %r +} + + +; 
CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; +; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] +; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 +; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] +; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { + %r = bitcast <2 x half> %a to <2 x i16> + ret <2 x i16> %r +} + +; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( +; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; +; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; +; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; +; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; +; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { + %r = bitcast <2 x i16> %a to <2 x half> + ret <2 x half> %r +} + + +declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0 +declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 +declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 + +; CHECK-LABEL: test_sqrt( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sqrt(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. 
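+;;; (Presumably this is because the NVPTX backend has no runtime library to
+;;; lower such calls into. The XCHECK prefix is never passed to FileCheck,
+;;; so the commented-out bodies below stay inert until a lowering exists.)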
+; XCHECK-LABEL: test_powi( +;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 { +; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) +; ret <2 x half> %r +;} + +; CHECK-LABEL: test_sin( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sin(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_cos( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_cos(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 { +; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp( +;define <2 x half> @test_exp(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define <2 x half> @test_exp2(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log( +;define <2 x half> @test_log(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define <2 x half> @test_log10(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_log2(
+;define <2 x half> @test_log2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fabs(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], 
[[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; +; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; +; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; +; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; +; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16; +; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { + %tb = fptrunc <2 x float> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f64( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0]; +; CHECK-DAG: ld.param.v2.f64 
{[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; +; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; +; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; +; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; +; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { + %tb = fptrunc <2 x double> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_extended( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]] +; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]]; +; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]]; +; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; +; CHECK: ret; +define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + %xr = fpext <2 x half> %r to <2 x float> + ret <2 x float> %xr +} + +; CHECK-LABEL: test_floor( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_floor(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_ceil( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; 
+; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_ceil(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_trunc( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_trunc(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_rint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_rint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_nearbyint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_nearbyint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_round( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_round(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.round.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_fmuladd( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2]; +; +; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: 
st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { + %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) + ret <2 x half> %r +} + +attributes #0 = { nounwind } +attributes #1 = { "unsafe-fp-math" = "true" } diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll index 6785a01827e2..351f9b20dc0c 100644 --- a/test/CodeGen/NVPTX/fma.ll +++ b/test/CodeGen/NVPTX/fma.ll @@ -1,42 +1,42 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s - -declare float @dummy_f32(float, float) #0 -declare double @dummy_f64(double, double) #0 - -define ptx_device float @t1_f32(float %x, float %y, float %z) { -; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; -; CHECK: ret; - %a = fmul float %x, %y - %b = fadd float %a, %z - ret float %b -} - -define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { -; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; -; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; -; CHECK: ret; - %a = fmul float %x, %y - %b = fadd float %a, %z - %c = fadd float %a, %w - %d = call float @dummy_f32(float %b, float %c) - ret float %d -} - -define ptx_device double @t1_f64(double %x, double %y, double %z) { -; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; -; CHECK: ret; - %a = fmul double %x, %y - %b = fadd double %a, %z - ret double %b -} - -define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { -; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; -; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; -; CHECK: ret; - %a = fmul double %x, %y - %b = fadd double %a, %z - %c = fadd double %a, %w - %d = call double @dummy_f64(double %b, double %c) - ret double %d -} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s + +declare float @dummy_f32(float, float) #0 +declare double @dummy_f64(double, double) #0 + +define ptx_device float @t1_f32(float %x, float %y, float %z) { +; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: ret; + %a = fmul float %x, %y + %b = fadd float %a, %z + ret float %b +} + +define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { +; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: ret; + %a = fmul float %x, %y + %b = fadd float %a, %z + %c = fadd float %a, %w + %d = call float @dummy_f32(float %b, float %c) + ret float %d +} + +define ptx_device double @t1_f64(double %x, double %y, double %z) { +; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fmul double %x, %y + %b = fadd double %a, %z + ret double %b +} + +define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { +; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fmul double %x, %y + %b = fadd double %a, %z + %c = fadd double %a, %w + %d = call double @dummy_f64(double %b, double %c) + ret double %d +} diff --git a/test/CodeGen/NVPTX/i8-param.ll b/test/CodeGen/NVPTX/i8-param.ll index 6a1e3a0e1a0d..c41da0eebd1f 100644 --- a/test/CodeGen/NVPTX/i8-param.ll +++ 
b/test/CodeGen/NVPTX/i8-param.ll @@ -1,23 +1,23 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -; CHECK: .visible .func (.param .b32 func_retval0) callee -define i8 @callee(i8 %a) { -; CHECK: ld.param.u8 - %ret = add i8 %a, 42 -; CHECK: st.param.b32 - ret i8 %ret -} - -; CHECK: .visible .func caller -define void @caller(i8* %a) { -; CHECK: ld.u8 - %val = load i8, i8* %a - %ret = tail call i8 @callee(i8 %val) -; CHECK: ld.param.b32 - store i8 %ret, i8* %a - ret void -} - - +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; CHECK: .visible .func (.param .b32 func_retval0) callee +define i8 @callee(i8 %a) { +; CHECK: ld.param.u8 + %ret = add i8 %a, 42 +; CHECK: st.param.b32 + ret i8 %ret +} + +; CHECK: .visible .func caller +define void @caller(i8* %a) { +; CHECK: ld.u8 + %val = load i8, i8* %a + %ret = tail call i8 @callee(i8 %val) +; CHECK: ld.param.b32 + store i8 %ret, i8* %a + ret void +} + + diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll index 8a67567acc96..83991a2930a8 100644 --- a/test/CodeGen/NVPTX/param-load-store.ll +++ b/test/CodeGen/NVPTX/param-load-store.ll @@ -1,939 +1,939 @@ -; Verifies correctness of load/store of parameters and return values. -; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s - -%s_i1 = type { i1 } -%s_i8 = type { i8 } -%s_i16 = type { i16 } -%s_f16 = type { half } -%s_i32 = type { i32 } -%s_f32 = type { float } -%s_i64 = type { i64 } -%s_f64 = type { double } - -; More complicated types. i64 is used to increase natural alignment -; requirement for the type. -%s_i32x4 = type { i32, i32, i32, i32, i64} -%s_i32f32 = type { i32, float, i32, float, i64} -%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} -%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> -%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} -; All scalar parameters must be at least 32 bits in size. -; i1 is loaded/stored as i8. - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i1( -; CHECK-NEXT: .param .b32 test_i1_param_0 -; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; -; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]] -; CHECK: .param .b32 retval0; -; CHECK: call.uni -; CHECK-NEXT: test_i1, -; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define i1 @test_i1(i1 %a) { - %r = tail call i1 @test_i1(i1 %a); - ret i1 %r; -} - -; Signed i1 is a somewhat special case. We only care about one bit and -; then us neg.s32 to convert it to 32-bit -1 if it's set. 
-; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i1s( -; CHECK-NEXT: .param .b32 test_i1s_param_0 -; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; -; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; -; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; -; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni -; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; -; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define signext i1 @test_i1s(i1 signext %a) { - %r = tail call signext i1 @test_i1s(i1 signext %a); - ret i1 %r; -} - -; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v3i1( -; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] -; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; -; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i1, -; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} -; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i1> @test_v3i1(<3 x i1> %a) { - %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); - ret <3 x i1> %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v4i1( -; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] -; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK: test_v4i1, -; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; -; CHECK-NEXT: ret; -define <4 x i1> @test_v4i1(<4 x i1> %a) { - %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); - ret <4 x i1> %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v5i1( -; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] -; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; -; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i1, -; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i1> @test_v5i1(<5 x i1> %a) { - %r = tail 
call <5 x i1> @test_v5i1(<5 x i1> %a); - ret <5 x i1> %r; -} - -; Unsigned i8 is loaded directly into 32-bit register. -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i8( -; CHECK-NEXT: .param .b32 test_i8_param_0 -; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0]; -; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; -; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8, -; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i8 @test_i8(i8 %a) { - %r = tail call i8 @test_i8(i8 %a); - ret i8 %r; -} - -; signed i8 is loaded into 16-bit register which is then sign-extended to i32. -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i8s( -; CHECK-NEXT: .param .b32 test_i8s_param_0 -; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; -; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8s, -; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; -; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? -; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; -; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define signext i8 @test_i8s(i8 signext %a) { - %r = tail call signext i8 @test_i8s(i8 signext %a); - ret i8 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v3i8( -; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] -; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2]; -; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+2], [[E2]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i8, -; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i8> @test_v3i8(<3 x i8> %a) { - %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); - ret <3 x i8> %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v4i8( -; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] -; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i8, -; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-NEXT: ret; -define <4 x i8> @test_v4i8(<4 x i8> %a) { - %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); - ret <4 x i8> %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v5i8( -; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] -; CHECK-DAG: ld.param.u8 
[[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; -; CHECK-DAG ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i8, -; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i8> @test_v5i8(<5 x i8> %a) { - %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a); - ret <5 x i8> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i16( -; CHECK-NEXT: .param .b32 test_i16_param_0 -; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0]; -; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E32]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i16, -; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i16 @test_i16(i16 %a) { - %r = tail call i16 @test_i16(i16 %a); - ret i16 %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i16s( -; CHECK-NEXT: .param .b32 test_i16s_param_0 -; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; -; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E32]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i16s, -; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; -; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define signext i16 @test_i16s(i16 signext %a) { - %r = tail call signext i16 @test_i16s(i16 signext %a); - ret i16 %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v3i16( -; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8] -; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; -; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b16 [param0+4], [[E2]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i16, -; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i16> @test_v3i16(<3 x i16> %a) { - %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a); - ret <3 x i16> %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v4i16( -; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8] -; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 8 .b8 
retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i16, -; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-NEXT: ret; -define <4 x i16> @test_v4i16(<4 x i16> %a) { - %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a); - ret <4 x i16> %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5i16( -; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] -; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; -; CHECK-DAG ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i16, -; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i16> @test_v5i16(<5 x i16> %a) { - %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a); - ret <5 x i16> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_f16( -; CHECK-NEXT: .param .b32 test_f16_param_0 -; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b16 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f16, -; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]] -; CHECK-NEXT: ret; -define half @test_f16(half %a) { - %r = tail call half @test_f16(half %a); - ret half %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v2f16( -; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4] -; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v2f16, -; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]] -; CHECK-NEXT: ret; -define <2 x half> @test_v2f16(<2 x half> %a) { - %r = tail call <2 x half> @test_v2f16(<2 x half> %a); - ret <2 x half> %r; -} - -; CHECK:.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v3f16( -; CHECK: .param .align 8 .b8 test_v3f16_param_0[8] -; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; -; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v3f16, -; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; -; CHECK: ret; -define <3 x half> @test_v3f16(<3 x half> %a) { 
- %r = tail call <3 x half> @test_v3f16(<3 x half> %a); - ret <3 x half> %r; -} - -; CHECK:.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v4f16( -; CHECK: .param .align 8 .b8 test_v4f16_param_0[8] -; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; -; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; -; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]}; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v4f16, -; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]}; -; CHECK: ret; -define <4 x half> @test_v4f16(<4 x half> %a) { - %r = tail call <4 x half> @test_v4f16(<4 x half> %a); - ret <4 x half> %r; -} - -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5f16( -; CHECK: .param .align 16 .b8 test_v5f16_param_0[16] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; -; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0+0], -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v5f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; -; CHECK: ret; -define <5 x half> @test_v5f16(<5 x half> %a) { - %r = tail call <5 x half> @test_v5f16(<5 x half> %a); - ret <5 x half> %r; -} - -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v8f16( -; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] -; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; -; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; -; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; -; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]]; -; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v8f16, -; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; -; CHECK: ret; -define <8 x half> @test_v8f16(<8 x half> %a) { - %r = tail call <8 x half> @test_v8f16(<8 x half> %a); - ret <8 x half> %r; -} - -; CHECK:.func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v9f16( -; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0]; -; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8]; -; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b16 [param0+0], -; CHECK-DAG: 
st.param.v4.b16 [param0+8], -; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_v9f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8]; -; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; -; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; -; CHECK: ret; -define <9 x half> @test_v9f16(<9 x half> %a) { - %r = tail call <9 x half> @test_v9f16(<9 x half> %a); - ret <9 x half> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i32( -; CHECK-NEXT: .param .b32 test_i32_param_0 -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i32 @test_i32(i32 %a) { - %r = tail call i32 @test_i32(i32 %a); - ret i32 %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v3i32( -; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16] -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i32, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i32> @test_v3i32(<3 x i32> %a) { - %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a); - ret <3 x i32> %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v4i32( -; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] -; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i32, -; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHCK-NEXT: ret; -define <4 x i32> @test_v4i32(<4 x i32> %a) { - %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a); - ret <4 x i32> %r; -} - -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v5i32( -; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32] -; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; -; CHECK-DAG ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], 
[[E2]], [[E3]]}; -; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i32, -; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i32> @test_v5i32(<5 x i32> %a) { - %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a); - ret <5 x i32> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_f32( -; CHECK-NEXT: .param .b32 test_f32_param_0 -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.f32 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define float @test_f32(float %a) { - %r = tail call float @test_f32(float %a); - ret float %r; -} - -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i64( -; CHECK-NEXT: .param .b64 test_i64_param_0 -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], [[E]]; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i64 @test_i64(i64 %a) { - %r = tail call i64 @test_i64(i64 %a); - ret i64 %r; -} - -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v3i64( -; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32] -; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b64 [param0+16], [[E2]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i64> @test_v3i64(<3 x i64> %a) { - %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a); - ret <3 x i64> %r; -} - -; For i64 vector loads are limited by PTX to 2 elements. 
-; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v4i64( -; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] -; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-NEXT: ret; -define <4 x i64> @test_v4i64(<4 x i64> %a) { - %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); - ret <4 x i64> %r; -} - -; Aggregates, on the other hand, do not get extended. - -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i1( -; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i1, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i1 @test_s_i1(%s_i1 %a) { - %r = tail call %s_i1 @test_s_i1(%s_i1 %a); - ret %s_i1 %r; -} - -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i8( -; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i8, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i8 @test_s_i8(%s_i8 %a) { - %r = tail call %s_i8 @test_s_i8(%s_i8 %a); - ret %s_i8 %r; -} - -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_i16( -; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] -; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i16 @test_s_i16(%s_i16 %a) { - %r = tail call %s_i16 @test_s_i16(%s_i16 %a); - ret %s_i16 %r; -} - -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_f16( -; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_f16, -; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_f16 @test_s_f16(%s_f16 %a) { - %r = tail call %s_f16 @test_s_f16(%s_f16 %a); - ret %s_f16 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_i32( -; CHECK-NEXT: .param 
.align 4 .b8 test_s_i32_param_0[4] -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i32 @test_s_i32(%s_i32 %a) { - %r = tail call %s_i32 @test_s_i32(%s_i32 %a); - ret %s_i32 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_f32( -; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.f32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_f32 @test_s_f32(%s_f32 %a) { - %r = tail call %s_f32 @test_s_f32(%s_f32 %a); - ret %s_f32 %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_s_i64( -; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.b64 [param0+0], [[E]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i64 @test_s_i64(%s_i64 %a) { - %r = tail call %s_i64 @test_s_i64(%s_i64 %a); - ret %s_i64 %r; -} - -; Fields that have different types, but identical sizes are not vectorized. -; CHECK: .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32f32( -; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24] -; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; -; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; -; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; -; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK-DAG: st.param.b32 [param0+0], [[E0]]; -; CHECK-DAG: st.param.f32 [param0+4], [[E1]]; -; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK-DAG: st.param.f32 [param0+12], [[E3]]; -; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32f32, -; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0]; -; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4]; -; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12]; -; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]]; -; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]]; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]]; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; -define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { - %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a); - ret %s_i32f32 %r; -} - -; We do vectorize consecutive fields with matching types. 
-; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32x4( -; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24] -; CHECK-DAG: ld.param.u64 [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16]; -; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; -; CHECK: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32x4, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; -; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; - -define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { - %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a); - ret %s_i32x4 %r; -} - -; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32]) -; CHECK-LABEL: test_s_i1i32x4( -; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32] -; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24]; -; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16]; -; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12]; -; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[32]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+8], [[E2]]; -; CHECK: st.param.b32 [param0+12], [[E3]]; -; CHECK: st.param.b32 [param0+16], [[E4]]; -; CHECK: st.param.b64 [param0+24], [[E5]]; -; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_s_i1i32x4, -; CHECK: ( -; CHECK: param0 -; CHECK: ); -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; -; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; -; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; -; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; -; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK: st.param.b64 [func_retval0+24], [[RE5]]; -; CHECK: ret; - -define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { - %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a); - ret %s_i8i32x4 %r; -} - -; -- All loads/stores from parameters aligned by one must be done one -; -- byte at a time. 
-; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) -; CHECK-LABEL: test_s_i1i32x4p( -; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; -; --- TODO -; --- Unaligned parameter store/ return value load is broken in both nvcc -; --- and llvm and needs to be fixed. -; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b32 [param0+0], -; CHECK-DAG: st.param.b32 [param0+4], -; CHECK-DAG: st.param.b8 [param0+8], -; CHECK-DAG: st.param.b32 [param0+9], -; CHECK-DAG: st.param.b32 [param0+13], -; CHECK-DAG: st.param.b64 [param0+17], -; CHECK: .param .align 1 .b8 retval0[25]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; -; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; -; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; -; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; -; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; -; CHECK-DAG: st.param.b32 [func_retval0+0], -; CHECK-DAG: st.param.b32 [func_retval0+4], -; CHECK-DAG: st.param.b8 [func_retval0+8], -; CHECK-DAG: st.param.b32 [func_retval0+9], -; CHECK-DAG: st.param.b32 [func_retval0+13], -; CHECK-DAG: st.param.b64 [func_retval0+17], - -define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { - %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); - ret %s_i8i32x4p %r; -} - -; Check that we can vectorize loads that span multiple aggregate fields. 
-; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) -; CHECK-LABEL: test_s_crossfield( -; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] -; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; -; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; -; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; -; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; -; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; -; CHECK: .param .align 16 .b8 param0[80]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; -; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; -; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; -; CHECK: st.param.b32 [param0+64], [[E15]]; -; CHECK: .param .align 16 .b8 retval0[80]; -; CHECK: call.uni (retval0), -; CHECK: test_s_crossfield, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; -; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; -; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; -; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; -; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; -; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; -; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; -; CHECK: ret; - -define %s_crossfield @test_s_crossfield(%s_crossfield %a) { - %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); - ret %s_crossfield %r; -} +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck %s + +%s_i1 = type { i1 } +%s_i8 = type { i8 } +%s_i16 = type { i16 } +%s_f16 = type { half } +%s_i32 = type { i32 } +%s_f32 = type { float } +%s_i64 = type { i64 } +%s_f64 = type { double } + +; More complicated types. i64 is used to increase natural alignment +; requirement for the type. +%s_i32x4 = type { i32, i32, i32, i32, i64} +%s_i32f32 = type { i32, float, i32, float, i64} +%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} +%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> +%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} +; All scalar parameters must be at least 32 bits in size. +; i1 is loaded/stored as i8. 
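+; For illustration only, a minimal caller (a sketch; the wrapper name
+; @i1_wrapper is made up and is not one of the checked functions below)
+; hits the same widening rule:
+;   define i1 @i1_wrapper(i1 %v) {
+;     %r = call i1 @test_i1(i1 %v)
+;     ret i1 %r
+;   }
+; The i1 travels as a .b32 param and is masked with "and.b32 ..., 1" both
+; when stored before the call and when reloaded from retval0, which is
+; exactly what the test_i1 checks below verify.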
+ +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1( +; CHECK-NEXT: .param .b32 test_i1_param_0 +; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]] +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK-NEXT: test_i1, +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i1 @test_i1(i1 %a) { + %r = tail call i1 @test_i1(i1 %a); + ret i1 %r; +} + +; Signed i1 is a somewhat special case. We only care about one bit and +; then use neg.s32 to convert it to 32-bit -1 if it's set. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1s( +; CHECK-NEXT: .param .b32 test_i1s_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; +; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; +; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i1 @test_i1s(i1 signext %a) { + %r = tail call signext i1 @test_i1s(i1 signext %a); + ret i1 %r; +} + +; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i1( +; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i1, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i1> @test_v3i1(<3 x i1> %a) { + %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); + ret <3 x i1> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i1( +; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK: test_v4i1, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; +; CHECK-NEXT: ret; +define <4 x i1> @test_v4i1(<4 x i1> %a) { + %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); + ret <4 x i1> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i1( +; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] +; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; +; CHECK-DAG: ld.param.v4.u8 
{[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i1,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i1> @test_v5i1(<5 x i1> %a) {
+ %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
+ ret <5 x i1> %r;
+}
+
+; Unsigned i8 is loaded directly into a 32-bit register.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8(
+; CHECK-NEXT: .param .b32 test_i8_param_0
+; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i8 @test_i8(i8 %a) {
+ %r = tail call i8 @test_i8(i8 %a);
+ ret i8 %r;
+}
+
+; Signed i8 is loaded into a 16-bit register, which is then sign-extended to i32.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8s(
+; CHECK-NEXT: .param .b32 test_i8s_param_0
+; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
+; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8s,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
+; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i8 @test_i8s(i8 signext %a) {
+ %r = tail call signext i8 @test_i8s(i8 signext %a);
+ ret i8 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
+; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
+; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+2], [[E2]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i8,
+; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i8> @test_v3i8(<3 x i8> %a) {
+ %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
+ ret <3 x i8> %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
+; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i8,
+; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i8> @test_v4i8(<4 x i8> %a) {
+ %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
+ ret <4 x i8> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i8(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
+; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i8,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i8> @test_v5i8(<5 x i8> %a) {
+ %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
+ ret <5 x i8> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16(
+; CHECK-NEXT: .param .b32 test_i16_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
+; CHECK: 
st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i16 @test_i16(i16 %a) {
+ %r = tail call i16 @test_i16(i16 %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16s(
+; CHECK-NEXT: .param .b32 test_i16s_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16s,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i16 @test_i16s(i16 signext %a) {
+ %r = tail call signext i16 @test_i16s(i16 signext %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
+; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i16,
+; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i16> @test_v3i16(<3 x i16> %a) {
+ %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
+ ret <3 x i16> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
+; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i16,
+; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i16> @test_v4i16(<4 x i16> %a) {
+ %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
+ ret <4 x i16> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5i16(
+; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
+; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i16,
+; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i16> 
@test_v5i16(<5 x i16> %a) {
+ %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
+ ret <5 x i16> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f16(
+; CHECK-NEXT: .param .b32 test_f16_param_0
+; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define half @test_f16(half %a) {
+ %r = tail call half @test_f16(half %a);
+ ret half %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v2f16(
+; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
+; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v2f16,
+; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define <2 x half> @test_v2f16(<2 x half> %a) {
+ %r = tail call <2 x half> @test_v2f16(<2 x half> %a);
+ ret <2 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3f16(
+; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
+; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
+; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v3f16,
+; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]];
+; CHECK: ret;
+define <3 x half> @test_v3f16(<3 x half> %a) {
+ %r = tail call <3 x half> @test_v3f16(<3 x half> %a);
+ ret <3 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4f16(
+; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
+; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v4f16,
+; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
+; CHECK: ret;
+define <4 x half> @test_v4f16(<4 x half> %a) {
+ %r = tail call <4 x half> @test_v4f16(<4 x half> %a);
+ ret <4 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5f16(
+; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], 
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK: test_v5f16, +; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; +; CHECK: ret; +define <5 x half> @test_v5f16(<5 x half> %a) { + %r = tail call <5 x half> @test_v5f16(<5 x half> %a); + ret <5 x half> %r; +} + +; CHECK:.func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v8f16( +; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] +; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; +; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; +; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; +; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]]; +; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]}; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK: test_v8f16, +; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; +; CHECK: ret; +define <8 x half> @test_v8f16(<8 x half> %a) { + %r = tail call <8 x half> @test_v8f16(<8 x half> %a); + ret <8 x half> %r; +} + +; CHECK:.func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v9f16( +; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] +; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0]; +; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8]; +; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16]; +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK-DAG: st.param.v4.b16 [param0+0], +; CHECK-DAG: st.param.v4.b16 [param0+8], +; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK: test_v9f16, +; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8]; +; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; +; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; +; CHECK: ret; +define <9 x half> @test_v9f16(<9 x half> %a) { + %r = tail call <9 x half> @test_v9f16(<9 x half> %a); + ret <9 x half> %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i32( +; CHECK-NEXT: .param .b32 test_i32_param_0 +; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i32, +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i32 @test_i32(i32 %a) { + %r = tail call i32 @test_i32(i32 %a); + ret i32 %r; +} + +; CHECK: .func 
(.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+ %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+ ret <3 x i32> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+ %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+ ret <4 x i32> %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+ %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+ ret <5 x i32> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f32(
+; CHECK-NEXT: .param .b32 test_f32_param_0
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_f32(float %a) {
+ %r = tail call float @test_f32(float %a);
+ ret float %r;
+}
+
+; CHECK: .func (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK: .param .b64 param0; 
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .b64 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+ %r = tail call i64 @test_i64(i64 %a);
+ ret i64 %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b64 [param0+16], [[E2]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i64> @test_v3i64(<3 x i64> %a) {
+ %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
+ ret <3 x i64> %r;
+}
+
+; For i64, vector loads are limited by PTX to 2 elements.
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v4i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
+; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-NEXT: ret;
+define <4 x i64> @test_v4i64(<4 x i64> %a) {
+ %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
+ ret <4 x i64> %r;
+}
+
+; Aggregates, on the other hand, do not get extended.
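As a compact sketch of the scalar/aggregate contrast (the names here are hypothetical and the expected .param shapes are inferred from the checks above and below, so treat this as an assumption, not part of the imported test):

%byte_box = type { i8 }

; A bare i8 argument is widened to a 32-bit slot (.param .b32)...
declare i8 @scalar_byte(i8)
; ...while the same byte wrapped in a struct keeps a 1-byte parameter
; (.param .align 1 .b8 [1]), matching test_s_i8 below.
declare %byte_box @boxed_byte(%byte_box)

define void @contrast(i8 %b, %byte_box %w) {
  %x = call i8 @scalar_byte(i8 %b)
  %y = call %byte_box @boxed_byte(%byte_box %w)
  ret void
}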
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i1(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i1_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i1,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i1 @test_s_i1(%s_i1 %a) {
+ %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
+ ret %s_i1 %r;
+}
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i8(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i8,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i8 @test_s_i8(%s_i8 %a) {
+ %r = tail call %s_i8 @test_s_i8(%s_i8 %a);
+ ret %s_i8 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_i16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
+; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i16,
+; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i16 @test_s_i16(%s_i16 %a) {
+ %r = tail call %s_i16 @test_s_i16(%s_i16 %a);
+ ret %s_i16 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_f16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f16 @test_s_f16(%s_f16 %a) {
+ %r = tail call %s_f16 @test_s_f16(%s_f16 %a);
+ ret %s_f16 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_i32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i32 @test_s_i32(%s_i32 %a) {
+ %r = tail call %s_i32 @test_s_i32(%s_i32 %a);
+ ret %s_i32 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_f32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f32 @test_s_f32(%s_f32 %a) {
+ %r = tail call %s_f32 
@test_s_f32(%s_f32 %a);
+ ret %s_f32 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_s_i64(
+; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i64 @test_s_i64(%s_i64 %a) {
+ %r = tail call %s_i64 @test_s_i64(%s_i64 %a);
+ ret %s_i64 %r;
+}
+
+; Fields that have different types but identical sizes are not vectorized.
+; CHECK: .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32f32(
+; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
+; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32f32,
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
+; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
+ %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
+ ret %s_i32f32 %r;
+}
+
+; We do vectorize consecutive fields with matching types.
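To restate the two rules with a case the test does not cover (a hypothetical type; the expected fused access is an inference from the neighboring checks, not something the imported test verifies):

; Unlike %s_i32f32 above, the two leading fields here have matching
; types, so by the rule above a fused ld.param.v2.f32 would be expected
; for them, as test_s_i32x4 below shows for i32 pairs.
%s_f32x2 = type { float, float, i64 }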
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32x4,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+ %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+ ret %s_i32x4 %r;
+}
+
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+8], [[E2]];
+; CHECK: st.param.b32 [param0+12], [[E3]];
+; CHECK: st.param.b32 [param0+16], [[E4]];
+; CHECK: st.param.b64 [param0+24], [[E5]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_s_i1i32x4,
+; CHECK: (
+; CHECK: param0
+; CHECK: );
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
+; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
+; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK: st.param.b8 [func_retval0+8], [[RE2]];
+; CHECK: st.param.b32 [func_retval0+12], [[RE3]];
+; CHECK: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK: st.param.b64 [func_retval0+24], [[RE5]];
+; CHECK: ret;
+
+define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
+ %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a);
+ ret %s_i8i32x4 %r;
+}
+
+; -- All loads/stores from parameters aligned by one must be done one
+; -- byte at a time.
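For reference, the offsets behind the byte-at-a-time behavior checked below (a sketch derived from the type definitions at the top of this file; the sizes match the [25] and [32] .param arrays in the checks):

; %s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64 }>   (packed)
;   field offsets 0, 4, 8, 9, 13, 17 -> size 25; the i32 at +9 is 1-byte aligned
; %s_i8i32x4  = type  { i32, i32, i8, i32, i32, i64 }    (natural)
;   field offsets 0, 4, 8, 12, 16, 24 -> size 32; every field naturally aligned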
+; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25])
+; CHECK-LABEL: test_s_i1i32x4p(
+; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
+; --- TODO
+; --- Unaligned parameter store / return value load is broken in both nvcc
+; --- and llvm and needs to be fixed.
+; CHECK: .param .align 1 .b8 param0[25];
+; CHECK-DAG: st.param.b32 [param0+0],
+; CHECK-DAG: st.param.b32 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+8],
+; CHECK-DAG: st.param.b32 [param0+9],
+; CHECK-DAG: st.param.b32 [param0+13],
+; CHECK-DAG: st.param.b64 [param0+17],
+; CHECK: .param .align 1 .b8 retval0[25];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i1i32x4p,
+; CHECK-DAG: ld.param.b32 %r41, [retval0+0];
+; CHECK-DAG: ld.param.b32 %r42, [retval0+4];
+; CHECK-DAG: ld.param.b8 %rs2, [retval0+8];
+; CHECK-DAG: ld.param.b32 %r43, [retval0+9];
+; CHECK-DAG: ld.param.b32 %r44, [retval0+13];
+; CHECK-DAG: ld.param.b64 %rd23, [retval0+17];
+; CHECK-DAG: st.param.b32 [func_retval0+0],
+; CHECK-DAG: st.param.b32 [func_retval0+4],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK-DAG: st.param.b32 [func_retval0+9],
+; CHECK-DAG: st.param.b32 [func_retval0+13],
+; CHECK-DAG: st.param.b64 [func_retval0+17],
+
+define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
+ %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
+ ret %s_i8i32x4p %r;
+}
+
+; Check that we can vectorize loads that span multiple aggregate fields.
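For orientation before the cross-field checks (a layout sketch derived from the %s_crossfield definition at the top of the file, not part of the imported test):

; %s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}] }
;   i32                   -> offset 0
;   [2 x i32]             -> offsets 4..11  (padding at 12..15)
;   <4 x i32>             -> offsets 16..31 (align 16)
;   [3 x {i32, i32, i32}] -> offsets 32..67 (tail padding to size 80)
; The v4.u32 accesses at +32 and +48 expected below each straddle two of
; the inner {i32, i32, i32} elements, which is the cross-field case.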
+; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) +; CHECK-LABEL: test_s_crossfield( +; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] +; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; +; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; +; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; +; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; +; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; +; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; +; CHECK: .param .align 16 .b8 param0[80]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b32 [param0+8], [[E2]]; +; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; +; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; +; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; +; CHECK: st.param.b32 [param0+64], [[E15]]; +; CHECK: .param .align 16 .b8 retval0[80]; +; CHECK: call.uni (retval0), +; CHECK: test_s_crossfield, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; +; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; +; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; +; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; +; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; +; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; +; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; +; CHECK: ret; + +define %s_crossfield @test_s_crossfield(%s_crossfield %a) { + %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); + ret %s_crossfield %r; +} diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll index fb01eb262adc..ecdf55ecdbeb 100644 --- a/test/CodeGen/NVPTX/sched1.ll +++ b/test/CodeGen/NVPTX/sched1.ll @@ -6,10 +6,10 @@ define void @foo(i32* %a) { ; CHECK: .func foo ; CHECK: ld.u32 ; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr i32, i32* %a, i32 0 %val0 = load i32, i32* %ptr0 diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll index 91ed77878f81..347f77c5682c 100644 --- a/test/CodeGen/NVPTX/sched2.ll +++ b/test/CodeGen/NVPTX/sched2.ll @@ -4,12 +4,12 @@ define void @foo(<2 x i32>* %a) { ; CHECK: .func foo ; CHECK: ld.v2.u32 ; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0 diff --git 
a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll index da6568685fe6..8ff0b5da5bcc 100644 --- a/test/CodeGen/NVPTX/simple-call.ll +++ b/test/CodeGen/NVPTX/simple-call.ll @@ -1,26 +1,26 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s - - - -; CHECK: .func ({{.*}}) device_func -define float @device_func(float %a) noinline { - %ret = fmul float %a, %a - ret float %ret -} - -; CHECK: .entry kernel_func -define void @kernel_func(float* %a) { - %val = load float, float* %a -; CHECK: call.uni (retval0), -; CHECK: device_func, - %mul = call float @device_func(float %val) - store float %mul, float* %a - ret void -} - - - -!nvvm.annotations = !{!1} - -!1 = !{void (float*)* @kernel_func, !"kernel", i32 1} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + + + +; CHECK: .func ({{.*}}) device_func +define float @device_func(float %a) noinline { + %ret = fmul float %a, %a + ret float %ret +} + +; CHECK: .entry kernel_func +define void @kernel_func(float* %a) { + %val = load float, float* %a +; CHECK: call.uni (retval0), +; CHECK: device_func, + %mul = call float @device_func(float %val) + store float %mul, float* %a + ret void +} + + + +!nvvm.annotations = !{!1} + +!1 = !{void (float*)* @kernel_func, !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll index a86ba1e29d5c..93b39c1125f8 100644 --- a/test/CodeGen/NVPTX/vec8.ll +++ b/test/CodeGen/NVPTX/vec8.ll @@ -7,7 +7,7 @@ define void @foo(<8 x i8> %a, i8* %b) { ; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] ; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] ; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1] -; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK-DAG: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; ; CHECK: st.u8 [%[[B]]], [[T]]; %t0 = extractelement <8 x i8> %a, i32 1 %t1 = extractelement <8 x i8> %a, i32 6 diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll index bf7b931a5758..d1ec8d25a107 100644 --- a/test/CodeGen/NVPTX/vector-call.ll +++ b/test/CodeGen/NVPTX/vector-call.ll @@ -1,30 +1,30 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -target triple = "nvptx-unknown-cuda" - -declare void @bar(<4 x i32>) - -; CHECK-LABEL: .func foo( -; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: call.uni -; CHECK: ret; -define void @foo(<4 x i32> %a) { - tail call void @bar(<4 x i32> %a) - ret void -} - -; CHECK-LABEL: .func foo3( -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK: call.uni -; CHECK: ret; -declare void @bar3(<3 x i32>) -define void @foo3(<3 x i32> %a) { - tail call void @bar3(<3 x i32> %a) - ret void -} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + +target triple = "nvptx-unknown-cuda" + +declare void @bar(<4 x i32>) + +; CHECK-LABEL: .func foo( 
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: call.uni
+; CHECK: ret;
+define void @foo(<4 x i32> %a) {
+ tail call void @bar(<4 x i32> %a)
+ ret void
+}
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK: call.uni
+; CHECK: ret;
+declare void @bar3(<3 x i32>)
+define void @foo3(<3 x i32> %a) {
+ tail call void @bar3(<3 x i32> %a)
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/zeroext-32bit.ll b/test/CodeGen/NVPTX/zeroext-32bit.ll
index c2f0ec4b1447..bcfd987b4a66 100644
--- a/test/CodeGen/NVPTX/zeroext-32bit.ll
+++ b/test/CodeGen/NVPTX/zeroext-32bit.ll
@@ -1,26 +1,26 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
-
-; The zeroext attribute below should be silently ignored because
-; we can pass a 32-bit integer across a function call without
-; needing to extend it.
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-unknown-cuda"
-
-; CHECK-LABEL: .visible .func zeroext_test
-; CHECK-NOT: cvt.u32.u16
-define void @zeroext_test() {
- tail call void @call1(i32 zeroext 0)
- ret void
-}
-
-declare void @call1(i32 zeroext)
-
-; CHECK-LABEL: .visible .func signext_test
-; CHECK-NOT: cvt.s32.s16
-define void @signext_test() {
- tail call void @call2(i32 zeroext 0)
- ret void
-}
-
-declare void @call2(i32 zeroext)
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s
+
+; The zeroext attribute below should be silently ignored because
+; we can pass a 32-bit integer across a function call without
+; needing to extend it.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-cuda"
+
+; CHECK-LABEL: .visible .func zeroext_test
+; CHECK-NOT: cvt.u32.u16
+define void @zeroext_test() {
+ tail call void @call1(i32 zeroext 0)
+ ret void
+}
+
+declare void @call1(i32 zeroext)
+
+; CHECK-LABEL: .visible .func signext_test
+; CHECK-NOT: cvt.s32.s16
+define void @signext_test() {
+ tail call void @call2(i32 zeroext 0)
+ ret void
+}
+
+declare void @call2(i32 zeroext)
diff --git a/test/CodeGen/PowerPC/mtvsrdd.ll b/test/CodeGen/PowerPC/mtvsrdd.ll
new file mode 100644
index 000000000000..1d6a3553b2a1
--- /dev/null
+++ b/test/CodeGen/PowerPC/mtvsrdd.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=pwr9 -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
+; RUN: < %s | FileCheck %s
+
+; This test case checks that r0 is used as constant 0 in the mtvsrdd instruction.
+
+define <2 x i64> @const0(i64 %a) {
+ %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 0, i32 1
+ ret <2 x i64> %vecinit1
+; CHECK-LABEL: const0
+; CHECK: mtvsrdd v2, 0, r3
+}
+
+define <2 x i64> @noconst0(i64* %a, i64* %b) {
+ %1 = load i64, i64* %a, align 8
+ %2 = load i64, i64* %b, align 8
+ %vecinit = insertelement <2 x i64> undef, i64 %2, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+ ret <2 x i64> %vecinit1
+; CHECK-LABEL: noconst0
+; CHECK: mtvsrdd v2, {{r[0-9]+}}, {{r[0-9]+}}
+}
diff --git a/test/CodeGen/PowerPC/setcc-logic.ll b/test/CodeGen/PowerPC/setcc-logic.ll
index 2ed08e2ae380..a5a86f101a94 100644
--- a/test/CodeGen/PowerPC/setcc-logic.ll
+++ b/test/CodeGen/PowerPC/setcc-logic.ll
@@ -6,7 +6,7 @@ define zeroext i1 @all_bits_clear(i32 %P, i32 %Q) {
 ; CHECK: # BB#0:
 ; CHECK-NEXT: or 3, 3, 4
 ; CHECK-NEXT: cntlzw 3, 3
-; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT: srwi 3, 3, 5
 ; CHECK-NEXT: blr
 %a = icmp eq i32 %P, 0
 %b = icmp eq i32 %Q, 0
@@ -30,11 +30,11 @@ define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q) {
 define zeroext i1 @all_bits_set(i32 %P, i32 %Q) {
 ; CHECK-LABEL: all_bits_set:
 ; CHECK: # BB#0:
+; CHECK-NEXT: li 5, -1
 ; CHECK-NEXT: and 3, 3, 4
-; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: li 12, 1
-; CHECK-NEXT: cmpwi 0, 3, -1
-; CHECK-NEXT: isel 3, 12, 5, 2
+; CHECK-NEXT: xor 3, 3, 5
+; CHECK-NEXT: cntlzw 3, 3
+; CHECK-NEXT: srwi 3, 3, 5
 ; CHECK-NEXT: blr
 %a = icmp eq i32 %P, -1
 %b = icmp eq i32 %Q, -1
@@ -437,7 +437,7 @@ define zeroext i1 @and_eq(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 z
 ; CHECK-NEXT: xor 3, 3, 4
 ; CHECK-NEXT: or 3, 3, 5
 ; CHECK-NEXT: cntlzw 3, 3
-; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT: srwi 3, 3, 5
 ; CHECK-NEXT: blr
 %cmp1 = icmp eq i16 %a, %b
 %cmp2 = icmp eq i16 %c, %d
diff --git a/test/CodeGen/PowerPC/stackmap-frame-setup.ll b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
index b5f1d4cfe4bc..b677b8be2966 100644
--- a/test/CodeGen/PowerPC/stackmap-frame-setup.ll
+++ b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
@@ -7,11 +7,11 @@ entry:
 store i64 11, i64* %metadata
 store i64 12, i64* %metadata
 store i64 13, i64* %metadata
-; ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
 ; ISEL-NEXT: STACKMAP
 ; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
 call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
-; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
 ; FAST-ISEL-NEXT: STACKMAP
 ; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
 ret void
diff --git a/test/CodeGen/PowerPC/tail-dup-layout.ll b/test/CodeGen/PowerPC/tail-dup-layout.ll
index c9b5bf8c9eeb..9665901e874f 100644
--- a/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O2 < %s | FileCheck %s
+; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s
+; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
@@ -99,11 +100,9 @@ exit:
 ; test1
 ; test2
 ; test3
-; test4
 ; optional1
 ; optional2
 ; optional3
-; optional4
 ; exit
 ; even for 50/50 branches.
 ; Tail duplication puts test n+1 at the end of optional n
@@ -162,6 +161,98 @@ exit:
 ret void
 }
 
+; Intended layout:
+; The chain-of-triangles-based duplication produces this layout when 3
+; instructions are allowed for tail duplication. 
+; test1 +; test2 +; test3 +; optional1 +; optional2 +; optional3 +; exit +; +; Otherwise it produces the layout: +; test1 +; optional1 +; test2 +; optional2 +; test3 +; optional3 +; exit + +;CHECK-LABEL: straight_test_3_instr_test: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30 +;CHECK-NEXT: cmplwi {{[0-9]+}}, 2 + +;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: # %test2 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK-O3: blr +;CHECK-O3-NEXT: .[[OPT1LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-O3-NEXT: .[[OPT2LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-O3-NEXT: .[[OPT3LABEL]]: +;CHECK-O3: b .[[EXITLABEL]] + +;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional1 +;CHECK-O2: .[[TEST2LABEL]]: # %test2 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional2 +;CHECK-O2: .[[TEST3LABEL]]: # %test3 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional3 +;CHECK-O2: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK-O2: blr + + +define void @straight_test_3_instr_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 3 + %tagbit1eq0 = icmp eq i32 %tagbit1, 2 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 12 + %tagbit2eq0 = icmp eq i32 %tagbit2, 8 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 48 + %tagbit3eq0 = icmp eq i32 %tagbit3, 32 + br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1 +optional3: + call void @c() + br label %exit +exit: + ret void +} + ; Intended layout: ; The chain-based outlining produces the layout ; entry diff --git a/test/CodeGen/PowerPC/testComparesieqsc.ll b/test/CodeGen/PowerPC/testComparesieqsc.ll new file mode 100644 index 000000000000..71ad5ed34969 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define 
signext i32 @test_ieqsc(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_sext(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_z(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_sext_z(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_sext_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_z_store(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_sext_z_store(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesieqsi.ll b/test/CodeGen/PowerPC/testComparesieqsi.ll new file mode 100644 index 000000000000..16882dbd0045 --- 
/dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsi.c' + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_sext(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_z(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_sext_z(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_sext_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_z_store(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) 
+; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_sext_z_store(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesieqss.ll b/test/CodeGen/PowerPC/testComparesieqss.ll new file mode 100644 index 000000000000..110c5a62804e --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqss.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqss.c' + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_sext(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_z(i16 signext %a) { +; CHECK-LABEL: test_ieqss_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_sext_z(i16 signext %a) { +; CHECK-LABEL: test_ieqss_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void 
@test_ieqss_sext_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_z_store(i16 signext %a) { +; CHECK-LABEL: test_ieqss_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_sext_z_store(i16 signext %a) { +; CHECK-LABEL: test_ieqss_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequc.ll b/test/CodeGen/PowerPC/testComparesiequc.ll new file mode 100644 index 000000000000..e2c975f2c191 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_sext(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_z(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_sext_z(i8 zeroext %a) { +; 
CHECK-LABEL: test_iequc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_sext_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_sext_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequi.ll b/test/CodeGen/PowerPC/testComparesiequi.ll new file mode 100644 index 000000000000..789b176a7700 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequi.c' + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone 
+define signext i32 @test_iequi_sext(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi_z(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi_sext_z(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_sext_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_sext_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequs.ll b/test/CodeGen/PowerPC/testComparesiequs.ll new file mode 100644 index 000000000000..b72943893e98 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequs.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw 
--implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequs.c' + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_sext(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_z(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_sext_z(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_sext_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_sext_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: 
addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsc.ll b/test/CodeGen/PowerPC/testCompareslleqsc.ll new file mode 100644 index 000000000000..56af12827931 --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testCompareslleqsc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_sext(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_z(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_sext_z(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_sext_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb 
r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_z_store(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_sext_z_store(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsi.ll b/test/CodeGen/PowerPC/testCompareslleqsi.ll new file mode 100644 index 000000000000..90cf2c85888e --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_sext(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_z(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_sext_z(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_store: +; CHECK: # BB#0: # %entry +; 
CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_sext_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_z_store(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_sext_z_store(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqss.ll b/test/CodeGen/PowerPC/testCompareslleqss.ll new file mode 100644 index 000000000000..df60a6ccc00e --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqss.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_sext(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_z(i16 signext %a) {
+; CHECK-LABEL: test_lleqss_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_sext_z(i16 signext %a) { +; CHECK-LABEL: test_lleqss_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_sext_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_z_store(i16 signext %a) { +; CHECK-LABEL: test_lleqss_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_sext_z_store(i16 signext %a) { +; CHECK-LABEL: test_lleqss_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequc.ll b/test/CodeGen/PowerPC/testComparesllequc.ll new file mode 100644 index 000000000000..248825761295 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequc.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc(i8 zeroext %a, i8 zeroext %b) { +; 
CHECK-LABEL: test_llequc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_sext(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_z(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_sext_z(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_sext_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_sext_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequi.ll b/test/CodeGen/PowerPC/testComparesllequi.ll new file mode 100644 index 000000000000..2342d80d94ef --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequi.ll @@ 
-0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_sext(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_z(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_sext_z(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_sext_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 
+ ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_sext_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequs.ll b/test/CodeGen/PowerPC/testComparesllequs.ll new file mode 100644 index 000000000000..e79a974c06f5 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequs.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_sext(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_z(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_sext_z(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_sext_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; 
CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_sext_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/SPARC/LeonItinerariesUT.ll b/test/CodeGen/SPARC/LeonItinerariesUT.ll index 87e0c4621c08..d586fe183a92 100644 --- a/test/CodeGen/SPARC/LeonItinerariesUT.ll +++ b/test/CodeGen/SPARC/LeonItinerariesUT.ll @@ -28,9 +28,9 @@ ; LEON3_4_ITIN-LABEL: f32_ops: ; LEON3_4_ITIN: ld ; LEON3_4_ITIN-NEXT: ld -; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fadds ; LEON3_4_ITIN-NEXT: ld +; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fsubs ; LEON3_4_ITIN-NEXT: fmuls ; LEON3_4_ITIN-NEXT: retl @@ -47,4 +47,4 @@ entry: %6 = fmul float %5, %3 %7 = fdiv float %6, %4 ret float %7 -} \ No newline at end of file +} diff --git a/test/CodeGen/SPARC/inlineasm-v9.ll b/test/CodeGen/SPARC/inlineasm-v9.ll new file mode 100644 index 000000000000..9c5424c46229 --- /dev/null +++ b/test/CodeGen/SPARC/inlineasm-v9.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=sparcv9 <%s | FileCheck %s + +;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints. +; CHECK-LABEL: faddd: +; CHECK: faddd %f0, %f2, %f0 +define double @faddd(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7 + ret double %2 +} + +; CHECK-LABEL: faddq: +; CHECK: faddq %f0, %f4, %f0 +define fp128 @faddq(fp128, fp128) local_unnamed_addr #2 { +entry: + %2 = tail call fp128 asm sideeffect "faddq $1, $2, $0;", "=f,f,e"(fp128 %0, fp128 %1) #7 + ret fp128 %2 +} + +;; Ensure that 'e' can indeed go in the high area, and 'f' cannot. +; CHECK-LABEL: faddd_high: +; CHECK: fmovd %f2, %f32 +; CHECK: fmovd %f0, %f2 +; CHECK: faddd %f2, %f32, %f2 +define double @faddd_high(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e,~{d0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7}"(double %0, double %1) #7 + ret double %2 +} + diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll index af631f0d29f5..35a62706c1ab 100644 --- a/test/CodeGen/SPARC/inlineasm.ll +++ b/test/CodeGen/SPARC/inlineasm.ll @@ -94,3 +94,21 @@ entry: %0 = call i64 asm sideeffect "xor $1, %g0, $0", "=r,0,~{i1}"(i64 5); ret i64 %0 } + + +;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints. 
+; CHECK-LABEL: fadds: +; CHECK: fadds %f0, %f1, %f0 +define float @fadds(float, float) local_unnamed_addr #2 { +entry: + %2 = tail call float asm sideeffect "fadds $1, $2, $0;", "=f,f,e"(float %0, float %1) #7 + ret float %2 +} + +; CHECK-LABEL: faddd: +; CHECK: faddd %f0, %f2, %f0 +define double @faddd(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7 + ret double %2 +} diff --git a/test/CodeGen/SystemZ/list-ilp-crash.ll b/test/CodeGen/SystemZ/list-ilp-crash.ll new file mode 100644 index 000000000000..c67ed318b93f --- /dev/null +++ b/test/CodeGen/SystemZ/list-ilp-crash.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -pre-RA-sched=list-ilp | FileCheck %s +; +; Check that list-ilp scheduler does not crash due to SystemZ's current use +; of MVT::Untyped. + +define void @pr32723(i8) { +; CHECK: .text +BB: + br label %CF245 + +CF245: ; preds = %CF245, %BB + %Shuff57 = shufflevector <4 x i8> zeroinitializer, <4 x i8> zeroinitializer, <4 x i32> + %Cmp84 = icmp uge i8 %0, undef + br i1 %Cmp84, label %CF245, label %CF260 + +CF260: ; preds = %CF245 + %B156 = sdiv <4 x i8> %Shuff57, %Shuff57 + br label %CF255 + +CF255: ; preds = %CF255, %CF260 + %I186 = insertelement <4 x i8> %B156, i8 %0, i32 2 + br label %CF255 +} diff --git a/test/CodeGen/SystemZ/lower-copy-undef-src.mir b/test/CodeGen/SystemZ/lower-copy-undef-src.mir new file mode 100644 index 000000000000..322460d79d68 --- /dev/null +++ b/test/CodeGen/SystemZ/lower-copy-undef-src.mir @@ -0,0 +1,14 @@ +# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -run-pass=postrapseudos -o - %s | FileCheck %s +# +# Test that a COPY with an undef source operand gets handled like an identity +# copy rather than lowered into a target instruction with the undef flag +# dropped. +--- +# CHECK-LABEL: name: undef_copy +# CHECK: %r13d = KILL undef %r0d, implicit killed %r12q, implicit-def %r12q +name: undef_copy +tracksRegLiveness: true +body: | + bb.0: + liveins: %r12q + %r13d = COPY undef %r0d, implicit killed %r12q, implicit-def %r12q diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll index d8d60413cb0e..5e7a40299ed7 100644 --- a/test/CodeGen/Thumb2/v8_IT_5.ll +++ b/test/CodeGen/Thumb2/v8_IT_5.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: b ; CHECK: [[JUMPTARGET]]:{{.*}}%if.else173 ; CHECK-NEXT: mov.w -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop ; CHECK-NEXT: %if.else145 ; CHECK-NEXT: mov.w diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll index 4ec703921e29..24aa5b98d0bb 100644 --- a/test/CodeGen/X86/2007-01-08-InstrSched.ll +++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll @@ -11,12 +11,12 @@ define float @foo(float %x) nounwind { %tmp14 = fadd float %tmp12, %tmp7 ret float %tmp14 -; CHECK: mulss -; CHECK: mulss ; CHECK: mulss ; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss ; CHECK: ret } diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll index 8b11fd86ef17..ae60d57bbf49 100644 --- a/test/CodeGen/X86/2010-01-18-DbgValue.ll +++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll @@ -1,14 +1,19 @@ -; RUN: llc -march=x86 -O0 < %s | FileCheck %s -; Currently, dbg.declare generates a DEBUG_VALUE comment. Eventually it will -; generate DWARF and this test will need to be modified or removed. 
+; RUN: llc -march=x86 -O0 < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s +; CHECK-LABEL: .debug_info contents: + +; CHECK-LABEL: DW_TAG_subprogram +; CHECK: DW_AT_name [DW_FORM_strp] ( {{.*}}"foo") +; CHECK: DW_TAG_formal_parameter +; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 {{..}} ) +; DW_OP_fbreg ?? +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"my_r0") %struct.Pt = type { double, double } %struct.Rect = type { %struct.Pt, %struct.Pt } define double @foo(%struct.Rect* byval %my_r0) nounwind ssp !dbg !1 { entry: -;CHECK: DEBUG_VALUE %retval = alloca double ; <double*> [#uses=2] %0 = alloca double ; <double*> [#uses=2] %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll deleted file mode 100644 index 495ff0304b1b..000000000000 --- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test LiveInterval update handling of DBG_VALUE. -; rdar://12777252. -; -; CHECK: %entry -; CHECK: DEBUG_VALUE: subdivp:hg -; CHECK: j - -%struct.node.0.27 = type { i16, double, [3 x double], i32, i32 } -%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] } -%struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* } - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp !dbg !14 { -entry: - call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !14) - %type = getelementptr inbounds %struct.node.0.27, %struct.node.0.27* %p, i64 0, i32 0 - %0 = load i16, i16* %type, align 2 - %cmp = icmp eq i16 %0, 1 - br i1 %cmp, label %return, label %for.cond.preheader - -for.cond.preheader: ; preds = %entry - %arrayidx6.1 = getelementptr inbounds %struct.hgstruct.2.29, %struct.hgstruct.2.29* %hg, i64 0, i32 1, i64 1 - %cmp22 = fcmp olt double 0.000000e+00, %dsq - %conv24 = zext i1 %cmp22 to i16 - br label %return - -return: ; preds = %for.cond.preheader, %entry - %retval.0 = phi i16 [ %conv24, %for.cond.preheader ], [ 0, %entry ] - ret i16 %retval.0 -} - -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!12} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2) -!2 = !{} -!4 = !DILocalVariable(name: "hg", line: 725, arg: 4, scope: !14, file: !5, type: !6) -!5 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh") -!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "hgstruct", line: 492, file: !11, baseType: !7) -!7 = !DICompositeType(tag: DW_TAG_structure_type, line: 487, size: 512, align: 64, file: !11) -!11 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh") -!12 = !{i32 1, !"Debug Info Version", i32 3} -!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped,
isOptimized: true, unit: !0, scopeLine: 1, file: !11, scope: !5, type: !15) -!15 = !DISubroutineType(types: !16) -!16 = !{null} diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll deleted file mode 100644 index fbe6000d7ace..000000000000 --- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll +++ /dev/null @@ -1,142 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test MachineScheduler handling of DBG_VALUE. -; rdar://12776937. -; -; CHECK: %if.else581 -; CHECK: DEBUG_VALUE: num1 -; CHECK: call - -%union.rec = type {} - -@.str15 = external hidden unnamed_addr constant [6 x i8], align 1 - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp !dbg !21 { -entry: - %num14075 = alloca [20 x i8], align 16 - br label %if.end33 - -if.end33: ; preds = %entry - %cmp1733 = icmp eq i32 undef, 0 - br label %if.else581 - -if.else581: ; preds = %if.end33 - %cmp586 = icmp eq i8 undef, -123 - br i1 %cmp586, label %if.then588, label %if.else594 - -if.then588: ; preds = %if.else581 - br label %for.cond1710.preheader - -if.else594: ; preds = %if.else581 - unreachable - -for.cond1710.preheader: ; preds = %if.then588 - br label %for.cond1710 - -for.cond1710: ; preds = %for.cond1710, %for.cond1710.preheader - br i1 undef, label %for.cond1710, label %if.then3344 - -if.then3344: - br label %if.then4073 - -if.then4073: ; preds = %if.then3344 - call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !5) - %arraydecay4078 = getelementptr inbounds [20 x i8], [20 x i8]* %num14075, i64 0, i64 0 - %0 = load i32, i32* undef, align 4 - %add4093 = add nsw i32 %0, 0 - %conv4094 = sitofp i32 %add4093 to float - %div4095 = fdiv float %conv4094, 5.670000e+02 - %conv4096 = fpext float %div4095 to double - %call4097 = call i32 (i8*, i32, i64, i8*, ...) @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind - br i1 %cmp1733, label %if.then4107, label %if.else4114 - -if.then4107: ; preds = %if.then4073 - unreachable - -if.else4114: ; preds = %if.then4073 - unreachable -} - -declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...) 
- -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!35} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !19, enums: !2, retainedTypes: !2, globals: !2) -!1 = !{!2} -!2 = !{} -!4 = !DILocalVariable(name: "num1", line: 815, scope: !5, file: !14, type: !15) -!5 = distinct !DILexicalBlock(line: 815, column: 0, file: !14, scope: !6) -!6 = distinct !DILexicalBlock(line: 812, column: 0, file: !14, scope: !7) -!7 = distinct !DILexicalBlock(line: 807, column: 0, file: !14, scope: !8) -!8 = distinct !DILexicalBlock(line: 440, column: 0, file: !14, scope: !9) -!9 = distinct !DILexicalBlock(line: 435, column: 0, file: !14, scope: !10) -!10 = distinct !DILexicalBlock(line: 434, column: 0, file: !14, scope: !11) -!11 = distinct !DILexicalBlock(line: 250, column: 0, file: !14, scope: !12) -!12 = distinct !DILexicalBlock(line: 249, column: 0, file: !14, scope: !13) -!13 = distinct !DILexicalBlock(line: 221, column: 0, file: !14, scope: !21) -!14 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset") -!15 = !DICompositeType(tag: DW_TAG_array_type, size: 160, align: 8, baseType: !16, elements: !17) -!16 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char) -!17 = !{!18} -!18 = !DISubrange(count: 20) -!19 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset") - -!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !19, scope: !14, type: !22) -!22 = !DISubroutineType(types: !23) -!23 = !{null} - -; Test DebugValue uses visited by RegisterPressureTracker findUseBetween(). -; -; CHECK: @main -; CHECK: DEBUG_VALUE: main:X -; CHECK: call - -%"class.__gnu_cxx::hash_map" = type { %"class.__gnu_cxx::hashtable" } -%"class.__gnu_cxx::hashtable" = type { i64, i64, i64, i64, i64, i64 } - -define void @main() uwtable ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !37 { -entry: - %X = alloca %"class.__gnu_cxx::hash_map", align 8 - br i1 undef, label %cond.true, label %cond.end - -cond.true: ; preds = %entry - unreachable - -cond.end: ; preds = %entry - call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !DIExpression()), !dbg !DILocation(scope: !37) - %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map", %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5 - invoke void @_Znwm() - to label %exit.i unwind label %lpad2.i.i.i.i - -exit.i: ; preds = %cond.end - unreachable - -lpad2.i.i.i.i: ; preds = %cond.end - %0 = landingpad { i8*, i32 } - cleanup - br i1 undef, label %lpad.body.i.i, label %if.then.i.i.i.i.i.i.i.i - -if.then.i.i.i.i.i.i.i.i: ; preds = %lpad2.i.i.i.i - unreachable - -lpad.body.i.i: ; preds = %lpad2.i.i.i.i - resume { i8*, i32 } %0 -} - -declare i32 @__gxx_personality_v0(...) 
- -declare void @_Znwm() - -!llvm.dbg.cu = !{!30} - -!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: FullDebug, file: !34, enums: !2, retainedTypes: !2) -!31 = !DILocalVariable(name: "X", line: 29, scope: !37, type: !32) -!32 = !DIDerivedType(tag: DW_TAG_typedef, name: "HM", line: 28, file: !34, baseType: null) -!33 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++") -!34 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++") -!35 = !{i32 1, !"Debug Info Version", i32 3} -!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !30, scopeLine: 1, file: !19, scope: !14, type: !22) diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll deleted file mode 100644 index a717202d3574..000000000000 --- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test RegisterPressure handling of DBG_VALUE. -; -; CHECK: %entry -; CHECK: DEBUG_VALUE: test:callback -; CHECK: ret - -%struct.btCompoundLeafCallback = type { i32, i32 } - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define void @test() unnamed_addr uwtable ssp align 2 !dbg !2 { -entry: - %callback = alloca %struct.btCompoundLeafCallback, align 8 - br i1 undef, label %if.end, label %if.then - -if.then: ; preds = %entry - unreachable - -if.end: ; preds = %entry - call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !DIExpression()), !dbg !DILocation(scope: !2) - %m = getelementptr inbounds %struct.btCompoundLeafCallback, %struct.btCompoundLeafCallback* %callback, i64 0, i32 1 - store i32 0, i32* undef, align 8 - %cmp12447 = icmp sgt i32 undef, 0 - br i1 %cmp12447, label %for.body.lr.ph, label %invoke.cont44 - -for.body.lr.ph: ; preds = %if.end - unreachable - -invoke.cont44: ; preds = %if.end - ret void -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: FullDebug, file: !6) -!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !6, scope: !5, type: !7) -!3 = !DILocalVariable(name: "callback", line: 214, scope: !2, type: !4) -!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 64, align: 64, file: !6) -!5 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet") -!6 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet") -!7 = !DISubroutineType(types: !9) -!8 = !{i32 1, !"Debug Info Version", i32 3} -!9 = !{null} diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll new file mode 100644 index 000000000000..553bc2789ff0 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +define i64 @test_add_i64(i64 %arg1, i64 %arg2) { +; ALL-LABEL: test_add_i64: +; ALL: # BB#0: +; ALL-NEXT: leaq (%rsi,%rdi), %rax +; ALL-NEXT: retq + %ret = add i64 %arg1, %arg2 + ret i64 %ret +} + +define i32 @test_add_i32(i32 %arg1, i32 %arg2) { +; ALL-LABEL: test_add_i32: +; ALL: # BB#0: +; ALL-NEXT: # kill: %EDI %EDI %RDI +; ALL-NEXT: # kill: %ESI %ESI %RSI +; ALL-NEXT: leal (%rsi,%rdi), %eax +; ALL-NEXT: retq + %ret = add i32 %arg1, %arg2 + ret i32 %ret +} + +define i16 @test_add_i16(i16 %arg1, i16 %arg2) { +; ALL-LABEL: test_add_i16: +; ALL: # BB#0: +; ALL-NEXT: # kill: %DI %DI %RDI +; ALL-NEXT: # kill: %SI %SI %RSI +; ALL-NEXT: leal (%rsi,%rdi), %eax +; ALL-NEXT: # kill: %AX %AX %EAX +; ALL-NEXT: retq + %ret = add i16 %arg1, %arg2 + ret i16 %ret +} + +define i8 @test_add_i8(i8 %arg1, i8 %arg2) { +; ALL-LABEL: test_add_i8: +; ALL: # BB#0: +; ALL-NEXT: addb %dil, %sil +; ALL-NEXT: movl %esi, %eax +; ALL-NEXT: retq + %ret = add i8 %arg1, %arg2 + ret i8 %ret +} diff --git a/test/CodeGen/X86/GlobalISel/binop.ll b/test/CodeGen/X86/GlobalISel/binop.ll index bf4c42cb4292..1aae1db8ab07 100644 --- a/test/CodeGen/X86/GlobalISel/binop.ll +++ b/test/CodeGen/X86/GlobalISel/binop.ll @@ -4,48 +4,6 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL -define i64 @test_add_i64(i64 %arg1, i64 %arg2) { -; ALL-LABEL: test_add_i64: -; ALL: # BB#0: -; ALL-NEXT: leaq (%rsi,%rdi), %rax -; ALL-NEXT: retq - %ret = add i64 %arg1, %arg2 - ret i64 %ret -} - -define i32 @test_add_i32(i32 %arg1, i32 %arg2) { -; ALL-LABEL: test_add_i32: -; ALL: # BB#0: -; ALL-NEXT: # kill: %EDI %EDI %RDI -; ALL-NEXT: # kill: %ESI %ESI %RSI -; ALL-NEXT: leal (%rsi,%rdi), %eax -; ALL-NEXT: retq - %ret = add i32 %arg1, %arg2 - ret i32 %ret -} - -define i16 @test_add_i16(i16 %arg1, i16 %arg2) { -; ALL-LABEL: test_add_i16: -; ALL: # BB#0: -; ALL-NEXT: # kill: %DI %DI %RDI -; ALL-NEXT: # kill: %SI %SI %RSI -; ALL-NEXT: leal (%rsi,%rdi), %eax -; ALL-NEXT: # kill: %AX %AX %EAX -; ALL-NEXT: retq - %ret = add i16 %arg1, %arg2 - ret i16 %ret -} - -define i8 @test_add_i8(i8 %arg1, i8 %arg2) { -; ALL-LABEL: test_add_i8: -; ALL: # BB#0: -; ALL-NEXT: addb %dil, %sil -; ALL-NEXT: movl %esi, %eax -; ALL-NEXT: retq - %ret = add i8 %arg1, %arg2 - ret i8 %ret -} - define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { ; ALL-LABEL: test_sub_i64: ; ALL: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/br.ll b/test/CodeGen/X86/GlobalISel/br.ll new file mode 100644 index 000000000000..faa6a0350337 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/br.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 + +define void @uncondbr() { +; CHECK-LABEL: uncondbr: +; CHECK: # BB#1: # %entry +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_2: # %end +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_3: # %bb2 +; CHECK-NEXT: jmp .LBB0_2 +entry: + br label %bb2 +end: + ret void +bb2: + br label %end +} + diff --git a/test/CodeGen/X86/GlobalISel/cmp.ll b/test/CodeGen/X86/GlobalISel/cmp.ll new file mode 
100644 index 000000000000..03692bb6b1de --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/cmp.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL + +define i32 @test_icmp_eq_i8(i8 %a, i8 %b) { +; ALL-LABEL: test_icmp_eq_i8: +; ALL: # BB#0: +; ALL-NEXT: cmpb %sil, %dil +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i16(i16 %a, i16 %b) { +; ALL-LABEL: test_icmp_eq_i16: +; ALL: # BB#0: +; ALL-NEXT: cmpw %si, %di +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i64(i64 %a, i64 %b) { +; ALL-LABEL: test_icmp_eq_i64: +; ALL: # BB#0: +; ALL-NEXT: cmpq %rsi, %rdi +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_eq_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ne_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ne_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setne %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ne i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ugt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: seta %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ugt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_uge_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_uge_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setae %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp uge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ult_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ult_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setb %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ule_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ule_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setbe %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ule i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_sgt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setg %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sgt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sge_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_sge_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setge %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_slt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_slt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setl %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp slt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sle_i32(i32 %a, i32 %b) { +; ALL-LABEL: 
test_icmp_sle_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setle %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sle i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll index c4d3566008b1..64cd0e70a4fd 100644 --- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll +++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll @@ -1,7 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64 -; TODO merge with ext.ll after i64 sext suported on 32bit platform +; TODO merge with ext.ll after i64 sext supported on 32bit platform + +define i64 @test_zext_i1(i8 %a) { +; X64-LABEL: test_zext_i1: +; X64: # BB#0: +; X64-NEXT: # kill: %DIL %DIL %RDI +; X64-NEXT: andq $1, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r +} define i64 @test_sext_i8(i8 %val) { ; X64-LABEL: test_sext_i8: diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll index 3c032686130e..4d4e3b05ca28 100644 --- a/test/CodeGen/X86/GlobalISel/ext.ll +++ b/test/CodeGen/X86/GlobalISel/ext.ll @@ -2,6 +2,24 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64 ; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X32 +define i32 @test_zext_i1(i32 %a) { +; X64-LABEL: test_zext_i1: +; X64: # BB#0: +; X64-NEXT: andl $1, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_zext_i1: +; X32: # BB#0: +; X32-NEXT: leal 4(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: andl $1, %eax +; X32-NEXT: retl + %val = trunc i32 %a to i1 + %r = zext i1 %val to i32 + ret i32 %r +} + define i32 @test_zext_i8(i8 %val) { ; X64-LABEL: test_zext_i8: ; X64: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/legalize-cmp.mir b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir new file mode 100644 index 000000000000..68ccbbba0a73 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir @@ -0,0 +1,179 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s + +--- | + define i32 @test_cmp_i8(i8 %a, i8 %b) { + %r = icmp ult i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i16(i16 %a, i16 %b) { + %r = icmp ult i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i32(i32 %a, i32 %b) { + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i64(i64 %a, i64 %b) { + %r = icmp ult i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_p0(i32* %a, i32* %b) { + %r = icmp ult i32* %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + +...
+--- +name: test_cmp_i8 +# CHECK-LABEL: name: test_cmp_i8 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s8) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s8), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s8), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i16 +# CHECK-LABEL: name: test_cmp_i16 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s16) = COPY %edi +# CHECK-NEXT: %1(s16) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s16), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s16), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i32 +# CHECK-LABEL: name: test_cmp_i32 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s32) = COPY %edi +# CHECK-NEXT: %1(s32) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s32), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i64 +# CHECK-LABEL: name: test_cmp_i64 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s64) = COPY %rdi +# CHECK-NEXT: %1(s64) = COPY %rsi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s64), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(ult), %0(s64), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_p0 +# CHECK-LABEL: name: test_cmp_p0 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(p0) = COPY %rdi +# CHECK-NEXT: %1(p0) = COPY %rsi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(p0), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(p0) = COPY %rdi + %1(p0) = COPY %rsi + %2(s1) = G_ICMP intpred(ult), %0(p0), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir index 25af600f2299..6f051f1b6ea5 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --- | + define i64 @test_sext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = sext i1 %val to i64 + ret i64 %r + } + define i64 @test_sext_i8(i8 %val) { %r = sext i8 %val to i64 ret i64 %r @@ -16,6 +22,12 @@ ret i64 %r } + define i64 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r + } + define i64 @test_zext_i8(i8 %val) { %r = zext i8 %val to i64 ret i64 %r @@ -31,6 +43,32 @@ ret i64 %r } +... +--- +name: test_sext_i1 +# CHECK-LABEL: name: test_sext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8) +# CHECK-NEXT: %2(s64) = G_SEXT %1(s1) +# CHECK-NEXT: %rax = COPY %2(s64) +# CHECK-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_SEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + ... --- name: test_sext_i8 @@ -100,6 +138,32 @@ body: | %rax = COPY %1(s64) RET 0, implicit %rax +... +--- +name: test_zext_i1 +# CHECK-LABEL: name: test_zext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8) +# CHECK-NEXT: %2(s64) = G_ZEXT %1(s1) +# CHECK-NEXT: %rax = COPY %2(s64) +# CHECK-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_ZEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + ... --- name: test_zext_i8 diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir index 46457e0fff59..c9add0dc4e95 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i32 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i32 + ret i32 %r + } + define i32 @test_zext_i8(i8 %val) { %r = zext i8 %val to i32 ret i32 %r @@ -11,6 +17,12 @@ ret i32 %r } + define i32 @test_sext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = sext i1 %val to i32 + ret i32 %r + } + define i32 @test_sext_i8(i8 %val) { %r = sext i8 %val to i32 ret i32 %r @@ -21,6 +33,32 @@ ret i32 %r } +... +--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s1) = G_TRUNC %0(s8) +# ALL-NEXT: %2(s32) = G_ZEXT %1(s1) +# ALL-NEXT: %eax = COPY %2(s32) +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s32) = G_ZEXT %1(s1) + %eax = COPY %2(s32) + RET 0, implicit %eax + ... 
--- name: test_zext_i8 @@ -67,6 +105,32 @@ body: | %eax = COPY %1(s32) RET 0, implicit %eax +... +--- +name: test_sext_i1 +# ALL-LABEL: name: test_sext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s1) = G_TRUNC %0(s8) +# ALL-NEXT: %2(s32) = G_SEXT %1(s1) +# ALL-NEXT: %eax = COPY %2(s32) +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s32) = G_SEXT %1(s1) + %eax = COPY %2(s32) + RET 0, implicit %eax + ... --- name: test_sext_i8 diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll new file mode 100644 index 000000000000..49a7fd79f8b2 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST +; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY + +; TODO merge with x86-64 tests (many operations not supported yet) + +define i8 @test_load_i8(i8 * %p1) { +; ALL-LABEL: test_load_i8: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movb (%eax), %al +; ALL-NEXT: retl + %r = load i8, i8* %p1 + ret i8 %r +} + +define i16 @test_load_i16(i16 * %p1) { +; ALL-LABEL: test_load_i16: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movzwl (%eax), %eax +; ALL-NEXT: retl + %r = load i16, i16* %p1 + ret i16 %r +} + +define i32 @test_load_i32(i32 * %p1) { +; ALL-LABEL: test_load_i32: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: retl + %r = load i32, i32* %p1 + ret i32 %r +} + +define i8 * @test_store_i8(i8 %val, i8 * %p1) { +; ALL-LABEL: test_store_i8: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movb (%eax), %cl +; ALL-NEXT: leal 8(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movb %cl, (%eax) +; ALL-NEXT: retl + store i8 %val, i8* %p1 + ret i8 * %p1; +} + +define i16 * @test_store_i16(i16 %val, i16 * %p1) { +; ALL-LABEL: test_store_i16: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movzwl (%eax), %ecx +; ALL-NEXT: leal 8(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movw %cx, (%eax) +; ALL-NEXT: retl + store i16 %val, i16* %p1 + ret i16 * %p1; +} + +define i32 * @test_store_i32(i32 %val, i32 * %p1) { +; ALL-LABEL: test_store_i32: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %ecx +; ALL-NEXT: leal 8(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movl %ecx, (%eax) +; ALL-NEXT: retl + store i32 %val, i32* %p1 + ret i32 * %p1; +} + +define i32* @test_load_ptr(i32** %ptr1) { +; ALL-LABEL: test_load_ptr: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: retl + %p = load i32*, i32** %ptr1 + ret i32* %p +} + +define void @test_store_ptr(i32** %ptr1, i32* %a) { +; ALL-LABEL: test_store_ptr: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: leal 8(%esp), %ecx +; ALL-NEXT: movl (%ecx), %ecx +; ALL-NEXT: movl %ecx, (%eax) +; ALL-NEXT: retl
+ store i32* %a, i32** %ptr1 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll new file mode 100644 index 000000000000..3e45a9c9a49d --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST +; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY + +define i8 @test_load_i8(i8 * %p1) { +; ALL-LABEL: test_load_i8: +; ALL: # BB#0: +; ALL-NEXT: movb (%rdi), %al +; ALL-NEXT: retq + %r = load i8, i8* %p1 + ret i8 %r +} + +define i16 @test_load_i16(i16 * %p1) { +; ALL-LABEL: test_load_i16: +; ALL: # BB#0: +; ALL-NEXT: movzwl (%rdi), %eax +; ALL-NEXT: retq + %r = load i16, i16* %p1 + ret i16 %r +} + +define i32 @test_load_i32(i32 * %p1) { +; ALL-LABEL: test_load_i32: +; ALL: # BB#0: +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: retq + %r = load i32, i32* %p1 + ret i32 %r +} + +define i64 @test_load_i64(i64 * %p1) { +; ALL-LABEL: test_load_i64: +; ALL: # BB#0: +; ALL-NEXT: movq (%rdi), %rax +; ALL-NEXT: retq + %r = load i64, i64* %p1 + ret i64 %r +} + +define float @test_load_float(float * %p1) { +; SSE-LABEL: test_load_float: +; SSE: # BB#0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_float: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: movl (%rdi), %eax +; ALL_AVX-NEXT: vmovd %eax, %xmm0 +; ALL_AVX-NEXT: retq + %r = load float, float* %p1 + ret float %r +} + +define double @test_load_double(double * %p1) { +; SSE-LABEL: test_load_double: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_double: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: movq (%rdi), %rax +; ALL_AVX-NEXT: vmovq %rax, %xmm0 +; ALL_AVX-NEXT: retq + %r = load double, double* %p1 + ret double %r +} + +define i32 * @test_store_i32(i32 %val, i32 * %p1) { +; ALL-LABEL: test_store_i32: +; ALL: # BB#0: +; ALL-NEXT: movl %edi, (%rsi) +; ALL-NEXT: movq %rsi, %rax +; ALL-NEXT: retq + store i32 %val, i32* %p1 + ret i32 * %p1; +} + +define i64 * @test_store_i64(i64 %val, i64 * %p1) { +; ALL-LABEL: test_store_i64: +; ALL: # BB#0: +; ALL-NEXT: movq %rdi, (%rsi) +; ALL-NEXT: movq %rsi, %rax +; ALL-NEXT: retq + store i64 %val, i64* %p1 + ret i64 * %p1; +} + +define float * @test_store_float(float %val, float * %p1) { +; +; SSE_FAST-LABEL: test_store_float: +; SSE_FAST: # BB#0: +; SSE_FAST-NEXT: movd %xmm0, %eax +; SSE_FAST-NEXT: movl %eax, (%rdi) +; SSE_FAST-NEXT: movq %rdi, %rax +; SSE_FAST-NEXT: retq +; +; SSE_GREEDY-LABEL: test_store_float: +; SSE_GREEDY: # BB#0: +; SSE_GREEDY-NEXT: movss %xmm0, (%rdi) +; SSE_GREEDY-NEXT: movq %rdi, %rax +; SSE_GREEDY-NEXT: retq + store float %val, float* %p1 + ret float * %p1; +} + +define double * @test_store_double(double %val, double * %p1) { +; +; SSE_FAST-LABEL: test_store_double: +; SSE_FAST: # BB#0: +; SSE_FAST-NEXT: movq %xmm0, %rax +; SSE_FAST-NEXT: movq %rax, (%rdi) +; SSE_FAST-NEXT: movq %rdi, %rax +; SSE_FAST-NEXT: retq +; +; SSE_GREEDY-LABEL: test_store_double: +; SSE_GREEDY: # BB#0: +; SSE_GREEDY-NEXT: movsd %xmm0, (%rdi) +; SSE_GREEDY-NEXT: movq %rdi, %rax +; SSE_GREEDY-NEXT: retq +; + store double %val, double* %p1 + ret double * %p1; +} + +define i32* @test_load_ptr(i32** %ptr1) { +; ALL-LABEL: 
test_load_ptr: +; ALL: # BB#0: +; ALL-NEXT: movq (%rdi), %rax +; ALL-NEXT: retq + %p = load i32*, i32** %ptr1 + ret i32* %p +} + +define void @test_store_ptr(i32** %ptr1, i32* %a) { +; ALL-LABEL: test_store_ptr: +; ALL: # BB#0: +; ALL-NEXT: movq %rsi, (%rdi) +; ALL-NEXT: retq + store i32* %a, i32** %ptr1 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll new file mode 100644 index 000000000000..e218fded4d5f --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX + +define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { +; ALL-LABEL: test_load_v4i32_noalign: +; ALL: # BB#0: +; ALL-NEXT: vmovups (%rdi), %xmm0 +; ALL-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 1 + ret <4 x i32> %r +} + +define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { +; ALL-LABEL: test_load_v4i32_align: +; ALL: # BB#0: +; ALL-NEXT: vmovaps (%rdi), %xmm0 +; ALL-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r +} + +define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { +; ALL-LABEL: test_store_v4i32_noalign: +; ALL: # BB#0: +; ALL-NEXT: vmovups %xmm0, (%rdi) +; ALL-NEXT: retq + store <4 x i32> %val, <4 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { +; ALL-LABEL: test_store_v4i32_align: +; ALL: # BB#0: +; ALL-NEXT: vmovaps %xmm0, (%rdi) +; ALL-NEXT: retq + store <4 x i32> %val, <4 x i32>* %p1, align 16 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/memop-x32.ll b/test/CodeGen/X86/GlobalISel/memop-x32.ll deleted file mode 100644 index 49a7fd79f8b2..000000000000 --- a/test/CodeGen/X86/GlobalISel/memop-x32.ll +++ /dev/null @@ -1,101 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST -; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY - -;TODO merge with x86-64 tests (many operations not suppored yet) - -define i8 @test_load_i8(i8 * %p1) { -; ALL-LABEL: test_load_i8: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movb (%eax), %al -; ALL-NEXT: retl - %r = load i8, i8* %p1 - ret i8 %r -} - -define i16 @test_load_i16(i16 * %p1) { -; ALL-LABEL: test_load_i16: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movzwl (%eax), %eax -; ALL-NEXT: retl - %r = load i16, i16* %p1 - ret i16 %r -} - -define i32 @test_load_i32(i32 * %p1) { -; ALL-LABEL: test_load_i32: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: retl - %r = load i32, i32* %p1 - ret i32 %r -} - -define i8 * @test_store_i8(i8 %val, i8 * %p1) { -; ALL-LABEL: test_store_i8: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movb (%eax), %cl -; ALL-NEXT: leal 8(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movb %cl, (%eax) -; ALL-NEXT: retl - store i8 %val, i8* %p1 
- ret i8 * %p1; -} - -define i16 * @test_store_i16(i16 %val, i16 * %p1) { -; ALL-LABEL: test_store_i16: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movzwl (%eax), %ecx -; ALL-NEXT: leal 8(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movw %cx, (%eax) -; ALL-NEXT: retl - store i16 %val, i16* %p1 - ret i16 * %p1; -} - -define i32 * @test_store_i32(i32 %val, i32 * %p1) { -; ALL-LABEL: test_store_i32: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %ecx -; ALL-NEXT: leal 8(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movl %ecx, (%eax) -; ALL-NEXT: retl - store i32 %val, i32* %p1 - ret i32 * %p1; -} - -define i32* @test_load_ptr(i32** %ptr1) { -; ALL-LABEL: test_load_ptr: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: retl - %p = load i32*, i32** %ptr1 - ret i32* %p -} - -define void @test_store_ptr(i32** %ptr1, i32* %a) { -; ALL-LABEL: test_store_ptr: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: leal 8(%esp), %ecx -; ALL-NEXT: movl (%ecx), %ecx -; ALL-NEXT: movl %ecx, (%eax) -; ALL-NEXT: retl - store i32* %a, i32** %ptr1 - ret void -} diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop.ll deleted file mode 100644 index a7407c0e6b75..000000000000 --- a/test/CodeGen/X86/GlobalISel/memop.ll +++ /dev/null @@ -1,206 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512F_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512F_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512VL_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512VL_GREEDY - - -define i8 @test_load_i8(i8 * %p1) { -; ALL-LABEL: test_load_i8: -; ALL: # BB#0: -; ALL-NEXT: movb (%rdi), %al -; ALL-NEXT: retq - %r = load i8, i8* %p1 - ret i8 %r -} - -define i16 @test_load_i16(i16 * %p1) { -; ALL-LABEL: test_load_i16: -; ALL: # BB#0: -; ALL-NEXT: movzwl (%rdi), %eax -; ALL-NEXT: retq - %r = load i16, i16* %p1 - ret i16 %r -} - -define i32 @test_load_i32(i32 * %p1) { -; ALL-LABEL: test_load_i32: -; ALL: # BB#0: -; ALL-NEXT: movl (%rdi), %eax -; 
ALL-NEXT: retq - %r = load i32, i32* %p1 - ret i32 %r -} - -define i64 @test_load_i64(i64 * %p1) { -; ALL-LABEL: test_load_i64: -; ALL: # BB#0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: retq - %r = load i64, i64* %p1 - ret i64 %r -} - -define float @test_load_float(float * %p1) { -; SSE-LABEL: test_load_float: -; SSE: # BB#0: -; SSE-NEXT: movl (%rdi), %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_float: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: movl (%rdi), %eax -; ALL_AVX-NEXT: vmovd %eax, %xmm0 -; ALL_AVX-NEXT: retq - %r = load float, float* %p1 - ret float %r -} - -define double @test_load_double(double * %p1) { -; SSE-LABEL: test_load_double: -; SSE: # BB#0: -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_double: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: movq (%rdi), %rax -; ALL_AVX-NEXT: vmovq %rax, %xmm0 -; ALL_AVX-NEXT: retq - %r = load double, double* %p1 - ret double %r -} - -define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_noalign: -; SSE: # BB#0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_noalign: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovups (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r -} - -define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_align: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_align: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovaps (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r -} - -define i32 * @test_store_i32(i32 %val, i32 * %p1) { -; ALL-LABEL: test_store_i32: -; ALL: # BB#0: -; ALL-NEXT: movl %edi, (%rsi) -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: retq - store i32 %val, i32* %p1 - ret i32 * %p1; -} - -define i64 * @test_store_i64(i64 %val, i64 * %p1) { -; ALL-LABEL: test_store_i64: -; ALL: # BB#0: -; ALL-NEXT: movq %rdi, (%rsi) -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: retq - store i64 %val, i64* %p1 - ret i64 * %p1; -} - -define float * @test_store_float(float %val, float * %p1) { -; -; SSE_FAST-LABEL: test_store_float: -; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movd %xmm0, %eax -; SSE_FAST-NEXT: movl %eax, (%rdi) -; SSE_FAST-NEXT: movq %rdi, %rax -; SSE_FAST-NEXT: retq -; -; SSE_GREEDY-LABEL: test_store_float: -; SSE_GREEDY: # BB#0: -; SSE_GREEDY-NEXT: movss %xmm0, (%rdi) -; SSE_GREEDY-NEXT: movq %rdi, %rax -; SSE_GREEDY-NEXT: retq -; -; ALL_AVX_FAST-LABEL: test_store_float: -; ALL_AVX_FAST: # BB#0: -; ALL_AVX_FAST-NEXT: vmovd %xmm0, %eax -; ALL_AVX_FAST-NEXT: movl %eax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_float: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovss %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq - store float %val, float* %p1 - ret float * %p1; -} - -define double * @test_store_double(double %val, double * %p1) { -; -; SSE_FAST-LABEL: test_store_double: -; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movq %xmm0, %rax -; SSE_FAST-NEXT: movq %rax, (%rdi) -; SSE_FAST-NEXT: movq %rdi, %rax -; SSE_FAST-NEXT: retq -; -; SSE_GREEDY-LABEL: test_store_double: -; SSE_GREEDY: # BB#0: -; SSE_GREEDY-NEXT: movsd %xmm0, (%rdi) -; SSE_GREEDY-NEXT: movq %rdi, %rax -; SSE_GREEDY-NEXT: retq -; -; ALL_AVX_FAST-LABEL: test_store_double: -; ALL_AVX_FAST: # BB#0: -; 
ALL_AVX_FAST-NEXT: vmovq %xmm0, %rax -; ALL_AVX_FAST-NEXT: movq %rax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_double: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovsd %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq - store double %val, double* %p1 - ret double * %p1; -} - -define i32* @test_load_ptr(i32** %ptr1) { -; ALL-LABEL: test_load_ptr: -; ALL: # BB#0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: retq - %p = load i32*, i32** %ptr1 - ret i32* %p -} - -define void @test_store_ptr(i32** %ptr1, i32* %a) { -; ALL-LABEL: test_store_ptr: -; ALL: # BB#0: -; ALL-NEXT: movq %rsi, (%rdi) -; ALL-NEXT: retq - store i32* %a, i32** %ptr1 - ret void -} diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir index 3a65a9003773..1ea922ee475a 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir @@ -2,11 +2,6 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -regbankselect-greedy -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY --- | - ; ModuleID = 'tmp.ll' - source_filename = "tmp.ll" - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - target triple = "x86_64--linux-gnu" - define i8 @test_add_i8(i8 %arg1, i8 %arg2) { %ret = add i8 %arg1, %arg2 ret i8 %ret @@ -120,6 +115,26 @@ ret void } + define i1 @test_icmp_eq_i8(i8 %a, i8 %b) { + %r = icmp eq i8 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i16(i16 %a, i16 %b) { + %r = icmp eq i16 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i32(i32 %a, i32 %b) { + %r = icmp eq i32 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i64(i64 %a, i64 %b) { + %r = icmp eq i64 %a, %b + ret i1 %r + } + ... --- name: test_add_i8 @@ -735,3 +750,103 @@ body: | RET 0 ... +--- +name: test_icmp_eq_i8 +# CHECK-LABEL: name: test_icmp_eq_i8 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s8), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... +--- +name: test_icmp_eq_i16 +# CHECK-LABEL: name: test_icmp_eq_i16 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s16), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... +--- +name: test_icmp_eq_i32 +# CHECK-LABEL: name: test_icmp_eq_i32 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s32), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... 
+--- +name: test_icmp_eq_i64 +# CHECK-LABEL: name: test_icmp_eq_i64 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(eq), %0(s64), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... diff --git a/test/CodeGen/X86/GlobalISel/select-br.mir b/test/CodeGen/X86/GlobalISel/select-br.mir new file mode 100644 index 000000000000..6d8cd2b1367d --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-br.mir @@ -0,0 +1,39 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32 + +--- | + define void @uncondbr() { + entry: + br label %bb2 + + end: ; preds = %bb2 + ret void + + bb2: ; preds = %entry + br label %end + } + +... +--- +name: uncondbr +# CHECK-LABEL: name: uncondbr +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: JMP_1 %bb.2.bb2 +# CHECK: JMP_1 %bb.1.end +body: | + bb.1.entry: + successors: %bb.3.bb2(0x80000000) + + G_BR %bb.3.bb2 + + bb.2.end: + RET 0 + + bb.3.bb2: + successors: %bb.2.end(0x80000000) + + G_BR %bb.2.end + +... diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir new file mode 100644 index 000000000000..1d3da6cb88b9 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir @@ -0,0 +1,563 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK + +--- | + define i32 @test_icmp_eq_i8(i8 %a, i8 %b) { + %r = icmp eq i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i16(i16 %a, i16 %b) { + %r = icmp eq i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i64(i64 %a, i64 %b) { + %r = icmp eq i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i32(i32 %a, i32 %b) { + %r = icmp eq i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ne_i32(i32 %a, i32 %b) { + %r = icmp ne i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) { + %r = icmp ugt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_uge_i32(i32 %a, i32 %b) { + %r = icmp uge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ult_i32(i32 %a, i32 %b) { + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ule_i32(i32 %a, i32 %b) { + %r = icmp ule i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) { + %r = icmp sgt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sge_i32(i32 %a, i32 %b) { + %r = icmp sge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_slt_i32(i32 %a, i32 %b) { + %r = icmp slt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sle_i32(i32 %a, i32 %b) { + %r = icmp sle i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + +... 
+--- +name: test_icmp_eq_i8 +# CHECK-LABEL: name: test_icmp_eq_i8 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr8 } +# CHECK-NEXT: - { id: 1, class: gr8 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %dil +# CHECK-NEXT: %1 = COPY %sil +# CHECK-NEXT: CMP8rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s8), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_eq_i16 +# CHECK-LABEL: name: test_icmp_eq_i16 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr16 } +# CHECK-NEXT: - { id: 1, class: gr16 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %di +# CHECK-NEXT: %1 = COPY %si +# CHECK-NEXT: CMP16rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s16), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_eq_i64 +# CHECK-LABEL: name: test_icmp_eq_i64 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr64 } +# CHECK-NEXT: - { id: 1, class: gr64 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %rdi +# CHECK-NEXT: %1 = COPY %rsi +# CHECK-NEXT: CMP64rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(eq), %0(s64), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_eq_i32 +# CHECK-LABEL: name: test_icmp_eq_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ne_i32 +# CHECK-LABEL: name: test_icmp_ne_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETNEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ne), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ugt_i32 +# CHECK-LABEL: name: test_icmp_ugt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETAr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ugt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_uge_i32 +# CHECK-LABEL: name: test_icmp_uge_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETAEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(uge), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ult_i32 +# CHECK-LABEL: name: test_icmp_ult_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETBr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ule_i32 +# CHECK-LABEL: name: test_icmp_ule_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETBEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ule), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_sgt_i32 +# CHECK-LABEL: name: test_icmp_sgt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETGr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sgt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_sge_i32 +# CHECK-LABEL: name: test_icmp_sge_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETGEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sge), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_slt_i32 +# CHECK-LABEL: name: test_icmp_slt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETLr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(slt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_sle_i32 +# CHECK-LABEL: name: test_icmp_sle_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETLEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sle), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir index 85b3f61a9e44..0844701487bc 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i64 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r + } + define i64 @test_sext_i8(i8 %val) { %r = sext i8 %val to i64 ret i64 %r @@ -11,6 +17,38 @@ ret i64 %r } +... +--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr8 } +# ALL-NEXT: - { id: 2, class: gr64 } +# ALL-NEXT: - { id: 3, class: gr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %dil +# ALL-NEXT: %1 = COPY %0 +# ALL-NEXT: %3 = SUBREG_TO_REG 0, %1, 1 +# ALL-NEXT: %2 = AND64ri8 %3, 1, implicit-def %eflags +# ALL-NEXT: %rax = COPY %2 +# ALL-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_ZEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + ... --- name: test_sext_i8 diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir index 63aeae89bd1a..831d6efb75f1 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext.mir @@ -2,6 +2,11 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i32 @test_zext_i1(i1 %a) { + %r = zext i1 %a to i32 + ret i32 %r + } + define i32 @test_zext_i8(i8 %val) { %r = zext i8 %val to i32 ret i32 %r @@ -22,6 +27,34 @@ ret i32 %r } +... 
+--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %dil +# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %1 = AND32ri8 %2, 1, implicit-def %eflags +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s32) = G_ZEXT %0(s1) + %eax = COPY %1(s32) + RET 0, implicit %eax + ... --- name: test_zext_i8 diff --git a/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir new file mode 100644 index 000000000000..8e6a2771db6e --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir @@ -0,0 +1,310 @@ +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL + +--- | + define i8 @test_load_i8(i8* %p1) { + %r = load i8, i8* %p1 + ret i8 %r + } + + define i16 @test_load_i16(i16* %p1) { + %r = load i16, i16* %p1 + ret i16 %r + } + + define i32 @test_load_i32(i32* %p1) { + %r = load i32, i32* %p1 + ret i32 %r + } + + define i8* @test_store_i8(i8 %val, i8* %p1) { + store i8 %val, i8* %p1 + ret i8* %p1 + } + + define i16* @test_store_i16(i16 %val, i16* %p1) { + store i16 %val, i16* %p1 + ret i16* %p1 + } + + define i32* @test_store_i32(i32 %val, i32* %p1) { + store i32 %val, i32* %p1 + ret i32* %p1 + } + + define i32* @test_load_ptr(i32** %ptr1) { + %p = load i32*, i32** %ptr1 + ret i32* %p + } + + define void @test_store_ptr(i32** %ptr1, i32* %a) { + store i32* %a, i32** %ptr1 + ret void + } + +... +--- +name: test_load_i8 +# ALL-LABEL: name: test_load_i8 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr8 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) +# ALL-NEXT: %al = COPY %2 +# ALL-NEXT: RET 0, implicit %al +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) + %al = COPY %2(s8) + RET 0, implicit %al + +... 
+--- +name: test_load_i16 +# ALL-LABEL: name: test_load_i16 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr16 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) +# ALL-NEXT: %ax = COPY %2 +# ALL-NEXT: RET 0, implicit %ax +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) + %ax = COPY %2(s16) + RET 0, implicit %ax + +... +--- +name: test_load_i32 +# ALL-LABEL: name: test_load_i32 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL-NEXT: %eax = COPY %2 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %eax = COPY %2(s32) + RET 0, implicit %eax + +... +--- +name: test_store_i8 +# ALL-LABEL: name: test_store_i8 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 1, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV8rm %2, 1, _, 0, _ :: (invariant load 1 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV8mr %1, 1, _, 0, _, %0 :: (store 1 into %ir.p1) +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(s8) = G_LOAD %2(p0) :: (invariant load 1 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %0(s8), %1(p0) :: (store 1 into %ir.p1) + %eax = COPY %1(p0) + RET 0, implicit %eax + +... 
+--- +name: test_store_i16 +# ALL-LABEL: name: test_store_i16 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr16 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 2, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV16rm %2, 1, _, 0, _ :: (invariant load 2 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV16mr %1, 1, _, 0, _, %0 :: (store 2 into %ir.p1) +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(s16) = G_LOAD %2(p0) :: (invariant load 2 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %0(s16), %1(p0) :: (store 2 into %ir.p1) + %eax = COPY %1(p0) + RET 0, implicit %eax + +... +--- +name: test_store_i32 +# ALL-LABEL: name: test_store_i32 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(s32) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %eax = COPY %1(p0) + RET 0, implicit %eax + +... 
+--- +name: test_load_ptr +# ALL-LABEL: name: test_load_ptr +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.ptr1) +# ALL-NEXT: %eax = COPY %2 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(p0) = G_LOAD %0(p0) :: (load 4 from %ir.ptr1) + %eax = COPY %2(p0) + RET 0, implicit %eax + +... +--- +name: test_store_ptr +# ALL-LABEL: name: test_store_ptr +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV32mr %0, 1, _, 0, _, %1 :: (store 4 into %ir.ptr1) +# ALL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(p0) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %1(p0), %0(p0) :: (store 4 into %ir.ptr1) + RET 0 + +... 
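
The new file above, select-memop-scalar-x32.mir, pins down how the i386 GlobalISel instruction selector handles scalar loads and stores when every argument arrives on the stack, which is why each test body starts with G_FRAME_INDEX plus an invariant fixed-stack load before the memory operation under test. To reproduce its RUN line outside the test harness, a minimal sketch, assuming llc and FileCheck are on PATH and the current directory is the LLVM source root:

  $ llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select \
        -verify-machineinstrs test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir -o - \
    | FileCheck test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir --check-prefix=ALL

This is just the file's own RUN line with the %s substitutions spelled out; FileCheck reads the llc output on stdin and the CHECK patterns from the test file itself.
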
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir new file mode 100644 index 000000000000..b57c9b0cca98 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir @@ -0,0 +1,500 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define i8 @test_load_i8(i8* %p1) { + %r = load i8, i8* %p1 + ret i8 %r + } + + define i16 @test_load_i16(i16* %p1) { + %r = load i16, i16* %p1 + ret i16 %r + } + + define i32 @test_load_i32(i32* %p1) { + %r = load i32, i32* %p1 + ret i32 %r + } + + define i64 @test_load_i64(i64* %p1) { + %r = load i64, i64* %p1 + ret i64 %r + } + + define float @test_load_float(float* %p1) { + %r = load float, float* %p1 + ret float %r + } + + define float @test_load_float_vecreg(float* %p1) { + %r = load float, float* %p1 + ret float %r + } + + define double @test_load_double(double* %p1) { + %r = load double, double* %p1 + ret double %r + } + + define double @test_load_double_vecreg(double* %p1) { + %r = load double, double* %p1 + ret double %r + } + + define i32* @test_store_i32(i32 %val, i32* %p1) { + store i32 %val, i32* %p1 + ret i32* %p1 + } + + define i64* @test_store_i64(i64 %val, i64* %p1) { + store i64 %val, i64* %p1 + ret i64* %p1 + } + + define float* @test_store_float(float %val, float* %p1) { + store float %val, float* %p1 + ret float* %p1 + } + + define float* @test_store_float_vec(float %val, float* %p1) { + store float %val, float* %p1 + ret float* %p1 + } + + define double* @test_store_double(double %val, double* %p1) { + store double %val, double* %p1 + ret double* %p1 + } + + define double* @test_store_double_vec(double %val, double* %p1) { + store double %val, double* %p1 + ret double* %p1 + } + + define i32* @test_load_ptr(i32** %ptr1) { + %p = load i32*, i32** %ptr1 + ret i32* %p + } + + define void @test_store_ptr(i32** %ptr1, i32* %a) { + store i32* %a, i32** %ptr1 + ret void + } +... +--- +# ALL-LABEL: name: test_load_i8 +name: test_load_i8 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr8 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) +# ALL: %al = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) + %al = COPY %1(s8) + RET 0, implicit %al + +... 
+--- +# ALL-LABEL: name: test_load_i16 +name: test_load_i16 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr16 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) +# ALL: %ax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- +# ALL-LABEL: name: test_load_i32 +name: test_load_i32 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr32 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %eax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %eax = COPY %1(s32) + RET 0, implicit %eax + +... +--- +# ALL-LABEL: name: test_load_i64 +name: test_load_i64 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %rax = COPY %1(s64) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_load_float +name: test_load_float +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr32 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %xmm0 = COPY %1(s32) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_float_vecreg +name: test_load_float_vecreg +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: fr32 } +# AVX512ALL: - { id: 1, class: fr32x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# AVX: %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %xmm0 = COPY %1(s32) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_double +name: test_load_double +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %xmm0 = COPY %1(s64) + RET 0, implicit %xmm0 + +... 
+--- +# ALL-LABEL: name: test_load_double_vecreg +name: test_load_double_vecreg +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: fr64 } +# AVX512ALL: - { id: 1, class: fr64x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# AVX: %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %xmm0 = COPY %1(s64) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_store_i32 +name: test_store_i32 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr32 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %edi +# ALL: %1 = COPY %rsi +# ALL: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %edi, %rsi + + %0(s32) = COPY %edi + %1(p0) = COPY %rsi + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_i64 +name: test_store_i64 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = COPY %rsi +# ALL: MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(p0) = COPY %rsi + G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_float +name: test_store_float +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: fr32x } +# ALL: - { id: 1, class: gr64 } +# ALL: - { id: 2, class: gr32 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# ALL: %2 = COPY %0 +# ALL: MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s32) = COPY %xmm0 + %1(p0) = COPY %rdi + %2(s32) = COPY %0(s32) + G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_float_vec +name: test_store_float_vec +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: fr32 } +# AVX512ALL: - { id: 0, class: fr32x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# AVX: VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s32) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... 
+--- +# ALL-LABEL: name: test_store_double +name: test_store_double +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: fr64x } +# ALL: - { id: 1, class: gr64 } +# ALL: - { id: 2, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# ALL: %2 = COPY %0 +# ALL: MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s64) = COPY %xmm0 + %1(p0) = COPY %rdi + %2(s64) = COPY %0(s64) + G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_double_vec +name: test_store_double_vec +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: fr64 } +# AVX512ALL: - { id: 0, class: fr64x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# AVX: VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s64) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_load_ptr +name: test_load_ptr +alignment: 4 +legalized: true +regBankSelected: true +selected: false +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.ptr1) +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(p0) = G_LOAD %0(p0) :: (load 8 from %ir.ptr1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_ptr +name: test_store_ptr +alignment: 4 +legalized: true +regBankSelected: true +selected: false +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.ptr1) +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(p0) = COPY %rdi + %1(p0) = COPY %rsi + G_STORE %1(p0), %0(p0) :: (store 8 into %ir.ptr1) + RET 0 + +... 
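
The x86-64 counterpart above, select-memop-scalar.mir, repeats the same scalar load/store matrix under four feature configurations; its RUN lines differ only in the -mattr flags and in which FileCheck prefixes (ALL, SSE, AVX, AVX512F, AVX512VL, and the shared NO_AVX512F/NO_AVX512VL/AVX512ALL groups) are active. Rather than invoking llc four times by hand, all RUN lines can be driven through lit at once; a sketch, assuming an LLVM build directory named build/ (the name is arbitrary) with llvm-lit in it:

  $ build/bin/llvm-lit -v test/CodeGen/X86/GlobalISel/select-memop-scalar.mir
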
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v128.mir b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir new file mode 100644 index 000000000000..ce3f6b91dcf6 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir @@ -0,0 +1,143 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x i32>* %p1, align 1 + ret <4 x i32> %r + } + + define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r + } + + define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 16 + ret <4 x i32>* %p1 + } + + define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 1 + ret <4 x i32>* %p1 + } + +... +--- +# ALL-LABEL: name: test_load_v4i32_noalign +name: test_load_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_v4i32_align +name: test_load_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... 
+--- +# ALL-LABEL: name: test_store_v4i32_align +name: test_store_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_v4i32_noalign +name: test_store_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... diff --git a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir b/test/CodeGen/X86/GlobalISel/select-memop-x32.mir deleted file mode 100644 index 8e6a2771db6e..000000000000 --- a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir +++ /dev/null @@ -1,310 +0,0 @@ -# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL - ---- | - define i8 @test_load_i8(i8* %p1) { - %r = load i8, i8* %p1 - ret i8 %r - } - - define i16 @test_load_i16(i16* %p1) { - %r = load i16, i16* %p1 - ret i16 %r - } - - define i32 @test_load_i32(i32* %p1) { - %r = load i32, i32* %p1 - ret i32 %r - } - - define i8* @test_store_i8(i8 %val, i8* %p1) { - store i8 %val, i8* %p1 - ret i8* %p1 - } - - define i16* @test_store_i16(i16 %val, i16* %p1) { - store i16 %val, i16* %p1 - ret i16* %p1 - } - - define i32* @test_store_i32(i32 %val, i32* %p1) { - store i32 %val, i32* %p1 - ret i32* %p1 - } - - define i32* @test_load_ptr(i32** %ptr1) { - %p = load i32*, i32** %ptr1 - ret i32* %p - } - - define void @test_store_ptr(i32** %ptr1, i32* %a) { - store i32* %a, i32** %ptr1 - ret void - } - -... 
---- -name: test_load_i8 -# ALL-LABEL: name: test_load_i8 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr8 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) -# ALL-NEXT: %al = COPY %2 -# ALL-NEXT: RET 0, implicit %al -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) - %al = COPY %2(s8) - RET 0, implicit %al - -... ---- -name: test_load_i16 -# ALL-LABEL: name: test_load_i16 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr16 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) -# ALL-NEXT: %ax = COPY %2 -# ALL-NEXT: RET 0, implicit %ax -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) - %ax = COPY %2(s16) - RET 0, implicit %ax - -... ---- -name: test_load_i32 -# ALL-LABEL: name: test_load_i32 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL-NEXT: %eax = COPY %2 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %eax = COPY %2(s32) - RET 0, implicit %eax - -... 
---- -name: test_store_i8 -# ALL-LABEL: name: test_store_i8 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 1, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV8rm %2, 1, _, 0, _ :: (invariant load 1 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV8mr %1, 1, _, 0, _, %0 :: (store 1 into %ir.p1) -# ALL-NEXT: %eax = COPY %1 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(s8) = G_LOAD %2(p0) :: (invariant load 1 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %0(s8), %1(p0) :: (store 1 into %ir.p1) - %eax = COPY %1(p0) - RET 0, implicit %eax - -... ---- -name: test_store_i16 -# ALL-LABEL: name: test_store_i16 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 2, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV16rm %2, 1, _, 0, _ :: (invariant load 2 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV16mr %1, 1, _, 0, _, %0 :: (store 2 into %ir.p1) -# ALL-NEXT: %eax = COPY %1 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(s16) = G_LOAD %2(p0) :: (invariant load 2 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %0(s16), %1(p0) :: (store 2 into %ir.p1) - %eax = COPY %1(p0) - RET 0, implicit %eax - -... 
---- -name: test_store_i32 -# ALL-LABEL: name: test_store_i32 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL-NEXT: %eax = COPY %1 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(s32) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %eax = COPY %1(p0) - RET 0, implicit %eax - -... ---- -name: test_load_ptr -# ALL-LABEL: name: test_load_ptr -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.ptr1) -# ALL-NEXT: %eax = COPY %2 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(p0) = G_LOAD %0(p0) :: (load 4 from %ir.ptr1) - %eax = COPY %2(p0) - RET 0, implicit %eax - -... 
---- -name: test_store_ptr -# ALL-LABEL: name: test_store_ptr -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV32mr %0, 1, _, 0, _, %1 :: (store 4 into %ir.ptr1) -# ALL-NEXT: RET 0 -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(p0) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %1(p0), %0(p0) :: (store 4 into %ir.ptr1) - RET 0 - -... diff --git a/test/CodeGen/X86/GlobalISel/select-memop.mir b/test/CodeGen/X86/GlobalISel/select-memop.mir deleted file mode 100644 index 817dc3cc9764..000000000000 --- a/test/CodeGen/X86/GlobalISel/select-memop.mir +++ /dev/null @@ -1,637 +0,0 @@ -# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL - ---- | - define i8 @test_load_i8(i8* %p1) { - %r = load i8, i8* %p1 - ret i8 %r - } - - define i16 @test_load_i16(i16* %p1) { - %r = load i16, i16* %p1 - ret i16 %r - } - - define i32 @test_load_i32(i32* %p1) { - %r = load i32, i32* %p1 - ret i32 %r - } - - define i64 @test_load_i64(i64* %p1) { - %r = load i64, i64* %p1 - ret i64 %r - } - - define float @test_load_float(float* %p1) { - %r = load float, float* %p1 - ret float %r - } - - define float @test_load_float_vecreg(float* %p1) { - %r = load float, float* %p1 - ret float %r - } - - - define double @test_load_double(double* %p1) { - %r = load double, double* %p1 - ret double %r - } - - define double @test_load_double_vecreg(double* %p1) { - %r = load double, double* %p1 - ret double %r - } - - define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r - } - - define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r - } - - define i32* @test_store_i32(i32 %val, i32* %p1) { - store i32 %val, i32* %p1 - ret i32* 
%p1 - } - - define i64* @test_store_i64(i64 %val, i64* %p1) { - store i64 %val, i64* %p1 - ret i64* %p1 - } - - define float* @test_store_float(float %val, float* %p1) { - store float %val, float* %p1 - ret float* %p1 - } - - define float* @test_store_float_vec(float %val, float* %p1) { - store float %val, float* %p1 - ret float* %p1 - } - - define double* @test_store_double(double %val, double* %p1) { - store double %val, double* %p1 - ret double* %p1 - } - - define double* @test_store_double_vec(double %val, double* %p1) { - store double %val, double* %p1 - ret double* %p1 - } - - define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 16 - ret <4 x i32>* %p1 - } - - define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 1 - ret <4 x i32>* %p1 - } - - define i32* @test_load_ptr(i32** %ptr1) { - %p = load i32*, i32** %ptr1 - ret i32* %p - } - - define void @test_store_ptr(i32** %ptr1, i32* %a) { - store i32* %a, i32** %ptr1 - ret void - } -... ---- -# ALL-LABEL: name: test_load_i8 -name: test_load_i8 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr8 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) -# ALL: %al = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) - %al = COPY %1(s8) - RET 0, implicit %al - -... ---- -# ALL-LABEL: name: test_load_i16 -name: test_load_i16 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr16 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) -# ALL: %ax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) - %ax = COPY %1(s16) - RET 0, implicit %ax - -... ---- -# ALL-LABEL: name: test_load_i32 -name: test_load_i32 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr32 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %eax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %eax = COPY %1(s32) - RET 0, implicit %eax - -... ---- -# ALL-LABEL: name: test_load_i64 -name: test_load_i64 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %rax = COPY %1(s64) - RET 0, implicit %rax - -... 
---- -# ALL-LABEL: name: test_load_float -name: test_load_float -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr32 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %xmm0 = COPY %1(s32) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_float_vecreg -name: test_load_float_vecreg -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: fr32 } -# AVX512ALL: - { id: 1, class: fr32x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# AVX: %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %xmm0 = COPY %1(s32) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_double -name: test_load_double -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %xmm0 = COPY %1(s64) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_double_vecreg -name: test_load_double_vecreg -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: fr64 } -# AVX512ALL: - { id: 1, class: fr64x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# AVX: %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %xmm0 = COPY %1(s64) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_v4i32_noalign -name: test_load_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... 
---- -# ALL-LABEL: name: test_load_v4i32_align -name: test_load_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_store_i32 -name: test_store_i32 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr32 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %edi -# ALL: %1 = COPY %rsi -# ALL: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %edi, %rsi - - %0(s32) = COPY %edi - %1(p0) = COPY %rsi - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_i64 -name: test_store_i64 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = COPY %rsi -# ALL: MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %rsi - - %0(s64) = COPY %rdi - %1(p0) = COPY %rsi - G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_float -name: test_store_float -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: fr32x } -# ALL: - { id: 1, class: gr64 } -# ALL: - { id: 2, class: gr32 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# ALL: %2 = COPY %0 -# ALL: MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s32) = COPY %xmm0 - %1(p0) = COPY %rdi - %2(s32) = COPY %0(s32) - G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_float_vec -name: test_store_float_vec -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: fr32 } -# AVX512ALL: - { id: 0, class: fr32x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# AVX: VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s32) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
---- -# ALL-LABEL: name: test_store_double -name: test_store_double -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: fr64x } -# ALL: - { id: 1, class: gr64 } -# ALL: - { id: 2, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# ALL: %2 = COPY %0 -# ALL: MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s64) = COPY %xmm0 - %1(p0) = COPY %rdi - %2(s64) = COPY %0(s64) - G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_double_vec -name: test_store_double_vec -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: fr64 } -# AVX512ALL: - { id: 0, class: fr64x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# AVX: VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s64) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_v4i32_align -name: test_store_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_v4i32_noalign -name: test_store_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
----
-# ALL-LABEL: name: test_load_ptr
-name: test_load_ptr
-alignment: 4
-legalized: true
-regBankSelected: true
-selected: false
-registers:
-# ALL: - { id: 0, class: gr64 }
-# ALL: - { id: 1, class: gr64 }
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.ptr1)
-body: |
-  bb.1 (%ir-block.0):
-    liveins: %rdi
-
-    %0(p0) = COPY %rdi
-    %1(p0) = G_LOAD %0(p0) :: (load 8 from %ir.ptr1)
-    %rax = COPY %1(p0)
-    RET 0, implicit %rax
-
-...
----
-# ALL-LABEL: name: test_store_ptr
-name: test_store_ptr
-alignment: 4
-legalized: true
-regBankSelected: true
-selected: false
-registers:
-# ALL: - { id: 0, class: gr64 }
-# ALL: - { id: 1, class: gr64 }
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-# ALL: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.ptr1)
-body: |
-  bb.1 (%ir-block.0):
-    liveins: %rdi, %rsi
-
-    %0(p0) = COPY %rdi
-    %1(p0) = COPY %rsi
-    G_STORE %1(p0), %0(p0) :: (store 8 into %ir.ptr1)
-    RET 0
-
-...
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
new file mode 100644
index 000000000000..262cb96ca6d8
--- /dev/null
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Target Pass Configuration
+; CHECK-NEXT: Type-Based Alias Analysis
+; CHECK-NEXT: Scoped NoAlias Alias Analysis
+; CHECK-NEXT: Assumption Cache Tracker
+; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Machine Module Information
+; CHECK-NEXT: Machine Branch Probability Analysis
+; CHECK-NEXT: ModulePass Manager
+; CHECK-NEXT: Pre-ISel Intrinsic Lowering
+; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand Atomic instructions
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: Lower Garbage Collection Instructions
+; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Inserts calls to mcount-like functions
+; CHECK-NEXT: Scalarize Masked Memory Intrinsics
+; CHECK-NEXT: Expand reduction intrinsics
+; CHECK-NEXT: Rewrite Symbols
+; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Exception handling preparation
+; CHECK-NEXT: Safe Stack instrumentation pass
+; CHECK-NEXT: Insert stack protectors
+; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: X86 DAG->DAG Instruction Selection
+; CHECK-NEXT: X86 PIC Global Base Reg Initialization
+; CHECK-NEXT: Expand ISel Pseudo-instructions
+; CHECK-NEXT: Local Stack Slot Allocation
+; CHECK-NEXT: X86 WinAlloca Expander
+; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Two-Address instruction pass
+; CHECK-NEXT: Fast Register Allocator
+; CHECK-NEXT: Bundle Machine CFG Edges
+; CHECK-NEXT: X86 FP Stackifier
+; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
+; CHECK-NEXT: Post-RA pseudo instruction expansion pass
+; CHECK-NEXT: X86 pseudo instruction expansion pass
+; CHECK-NEXT: Analyze Machine Code For Garbage Collection
+; CHECK-NEXT: X86 vzeroupper inserter
+; CHECK-NEXT: Contiguously Lay Out Funclets
+; CHECK-NEXT: StackMap Liveness Analysis
+; CHECK-NEXT: Live DEBUG_VALUE analysis
+; CHECK-NEXT: Insert fentry calls
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Machine Natural
Loop Construction +; CHECK-NEXT: Insert XRay ops +; CHECK-NEXT: Implement the 'patchable-function' attribute +; CHECK-NEXT: Lazy Machine Block Frequency Analysis +; CHECK-NEXT: Machine Optimization Remark Emitter +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: X86 Assembly Printer +; CHECK-NEXT: Free MachineFunction + +define void @f() { + ret void +} diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll index 35f488ea448c..d0160a5b84df 100644 --- a/test/CodeGen/X86/all-ones-vector.ll +++ b/test/CodeGen/X86/all-ones-vector.ll @@ -157,8 +157,8 @@ define <32 x i8> @allones_v32i8() nounwind { ; ; X32-AVX1-LABEL: allones_v32i8: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v32i8: @@ -174,8 +174,8 @@ define <32 x i8> @allones_v32i8() nounwind { ; ; X64-AVX1-LABEL: allones_v32i8: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v32i8: @@ -194,8 +194,8 @@ define <16 x i16> @allones_v16i16() nounwind { ; ; X32-AVX1-LABEL: allones_v16i16: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v16i16: @@ -211,8 +211,8 @@ define <16 x i16> @allones_v16i16() nounwind { ; ; X64-AVX1-LABEL: allones_v16i16: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v16i16: @@ -231,8 +231,8 @@ define <8 x i32> @allones_v8i32() nounwind { ; ; X32-AVX1-LABEL: allones_v8i32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8i32: @@ -248,8 +248,8 @@ define <8 x i32> @allones_v8i32() nounwind { ; ; X64-AVX1-LABEL: allones_v8i32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8i32: @@ -268,8 +268,8 @@ define <4 x i64> @allones_v4i64() nounwind { ; ; X32-AVX1-LABEL: allones_v4i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4i64: @@ -285,8 +285,8 @@ define <4 x i64> @allones_v4i64() nounwind { ; ; X64-AVX1-LABEL: allones_v4i64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps 
%ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4i64: @@ -305,8 +305,8 @@ define <4 x double> @allones_v4f64() nounwind { ; ; X32-AVX1-LABEL: allones_v4f64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4f64: @@ -322,8 +322,8 @@ define <4 x double> @allones_v4f64() nounwind { ; ; X64-AVX1-LABEL: allones_v4f64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4f64: @@ -342,8 +342,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize { ; ; X32-AVX1-LABEL: allones_v4f64_optsize: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4f64_optsize: @@ -359,8 +359,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize { ; ; X64-AVX1-LABEL: allones_v4f64_optsize: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4f64_optsize: @@ -379,8 +379,8 @@ define <8 x float> @allones_v8f32() nounwind { ; ; X32-AVX1-LABEL: allones_v8f32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8f32: @@ -396,8 +396,8 @@ define <8 x float> @allones_v8f32() nounwind { ; ; X64-AVX1-LABEL: allones_v8f32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8f32: @@ -416,8 +416,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize { ; ; X32-AVX1-LABEL: allones_v8f32_optsize: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8f32_optsize: @@ -433,8 +433,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize { ; ; X64-AVX1-LABEL: allones_v8f32_optsize: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8f32_optsize: @@ -455,8 +455,8 @@ define <64 x i8> @allones_v64i8() nounwind { ; ; X32-AVX1-LABEL: allones_v64i8: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: 
vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -487,8 +487,8 @@ define <64 x i8> @allones_v64i8() nounwind { ; ; X64-AVX1-LABEL: allones_v64i8: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -522,8 +522,8 @@ define <32 x i16> @allones_v32i16() nounwind { ; ; X32-AVX1-LABEL: allones_v32i16: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -554,8 +554,8 @@ define <32 x i16> @allones_v32i16() nounwind { ; ; X64-AVX1-LABEL: allones_v32i16: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -589,8 +589,8 @@ define <16 x i32> @allones_v16i32() nounwind { ; ; X32-AVX1-LABEL: allones_v16i32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -615,8 +615,8 @@ define <16 x i32> @allones_v16i32() nounwind { ; ; X64-AVX1-LABEL: allones_v16i32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -644,8 +644,8 @@ define <8 x i64> @allones_v8i64() nounwind { ; ; X32-AVX1-LABEL: allones_v8i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -670,8 +670,8 @@ define <8 x i64> @allones_v8i64() nounwind { ; ; X64-AVX1-LABEL: allones_v8i64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -699,8 +699,8 @@ define <8 x double> @allones_v8f64() nounwind { ; ; X32-AVX1-LABEL: allones_v8f64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -725,8 +725,8 @@ define <8 x double> @allones_v8f64() nounwind { ; ; X64-AVX1-LABEL: allones_v8f64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -754,8 +754,8 @@ define <16 x float> @allones_v16f32() nounwind { ; ; X32-AVX1-LABEL: 
allones_v16f32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -780,8 +780,8 @@ define <16 x float> @allones_v16f32() nounwind { ; ; X64-AVX1-LABEL: allones_v16f32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 2aaf14001758..aa28ef5175ed 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd 
{{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, 
%xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp -; SSE2-NEXT: .Lcfi0: -; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 
16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa 16(%rsi), %xmm13 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm10 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: paddd %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: paddd %xmm8, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: paddd %xmm2, %xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: psrld $1, %xmm15 ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: packuswb %xmm15, %xmm13 +; SSE2-NEXT: packuswb %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: packuswb %xmm9, %xmm6 ; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm11 +; SSE2-NEXT: packuswb %xmm6, %xmm11 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm5, %xmm7 +; SSE2-NEXT: packuswb %xmm3, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rax) +; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: movdqu %xmm13, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd 
{{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 @@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -884,9 +867,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 @@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; 
SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 
= xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index e6cc95fcdb23..6869d088e7cd 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -34,8 +34,8 @@ define void @zero256() nounwind ssp { define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind { ; CHECK-LABEL: ones: ; CHECK: ## BB#0: ## %allocas -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ float>* %ptr2vec615, align 32 define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind { ; CHECK-LABEL: ones2: ; CHECK: ## BB#0: ## %allocas -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx-cvt-3.ll b/test/CodeGen/X86/avx-cvt-3.ll index 066719b3bfe8..231334ddcb85 100644 --- a/test/CodeGen/X86/avx-cvt-3.ll +++ b/test/CodeGen/X86/avx-cvt-3.ll @@ -48,16 +48,16 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) { define <8 x float> 
@sitofp_insert_allbits_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_insert_allbits_v8i32: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_insert_allbits_v8i32: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq @@ -72,16 +72,16 @@ define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) { define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_shuffle_allbits_v8i32: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_shuffle_allbits_v8i32: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq @@ -95,8 +95,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X86: # BB#0: ; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: movl $2, %eax @@ -111,8 +110,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X64: # BB#0: ; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: movl $2, %eax diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 1d925ff8e9bd..3cadbe2a8db3 100644 --- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -99,16 +99,16 @@ define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; X32-LABEL: test_mm256_andnot_pd: ; X32: # BB#0: -; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X32-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; X32-NEXT: vxorps %ymm2, %ymm2, %ymm2 +; X32-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 ; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_andnot_pd: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X64-NEXT: vinsertf128 
$1, %xmm2, %ymm2, %ymm2 +; X64-NEXT: vxorps %ymm2, %ymm2, %ymm2 +; X64-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 ; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq @@ -2244,11 +2244,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_pd: @@ -2269,19 +2269,19 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3 ; X32: # BB#0: ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_ps: @@ -2881,10 +2881,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; @@ -2908,16 +2908,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; 
X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index 052cacfea4dc..bb05481e313d 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -2837,4 +2837,54 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ret <8 x float> %8 } +define void @test_zeroall() { +; SANDY-LABEL: test_zeroall: +; SANDY: # BB#0: +; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_zeroall: +; HASWELL: # BB#0: +; HASWELL-NEXT: vzeroall # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_zeroall: +; BTVER2: # BB#0: +; BTVER2-NEXT: vzeroall # sched: [?:0.000000e+00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_zeroall: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.avx.vzeroall() + ret void +} +declare void @llvm.x86.avx.vzeroall() nounwind + +define void @test_zeroupper() { +; SANDY-LABEL: test_zeroupper: +; SANDY: # BB#0: +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_zeroupper: +; HASWELL: # BB#0: +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_zeroupper: +; BTVER2: # BB#0: +; BTVER2-NEXT: vzeroupper # sched: [?:0.000000e+00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_zeroupper: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.avx.vzeroupper() + ret void +} +declare void @llvm.x86.avx.vzeroupper() nounwind + !0 = !{i32 1} diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll index 341dd867e4ff..647b7a8f4dfc 100644 --- a/test/CodeGen/X86/avx.ll +++ b/test/CodeGen/X86/avx.ll @@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 +; CHECK: vaddps ; CHECK: insertps $48 ; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps -; CHECK: vaddps ; CHECK-NEXT: ret %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll index 
63b0281a7339..e29cf09718ad 100644 --- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: korw %k3, %k2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2 +; CHECK-NEXT: korw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX %AX %EAX diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 4890afec2164..c03623a2f035 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b ; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vmovaps %zmm1, %zmm3 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm4 -; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} ; CHECK-NEXT: movw $220, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 0e7a8d25c56f..56962ca2671d 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: 
retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) @@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) @@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1616,9 +1616,9 @@ define <8 x 
double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) @@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) @@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, < ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) @@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; 
CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, < ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) @@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index cc5e9e038e0b..f800d01064ba 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -274,11 +274,11 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> % ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) @@ -301,11 +301,11 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: 
vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) @@ -477,11 +477,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) @@ -496,11 +496,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rcx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2si %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) @@ -515,11 +515,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rcx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4) @@ -534,11 +534,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %rcx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2si %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) @@ -553,11 +553,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax 
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
@@ -572,11 +572,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2si %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
@@ -591,11 +591,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2usi %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
@@ -610,11 +610,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %ecx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2si %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
@@ -683,8 +683,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
 %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
@@ -3656,11 +3657,11 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT: retq
 %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
@@ -3684,10 +3685,10 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou
 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2
 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT: retq
 %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
@@ -3903,11 +3904,11 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
 ; CHECK-NEXT: vmovapd %xmm2, %xmm3
 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
@@ -3928,11 +3929,11 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
@@ -4434,8 +4435,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -4454,8 +4455,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -4556,9 +4557,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
 ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
 %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
@@ -4579,9 +4580,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vmovapd %zmm0, %zmm5
 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
 %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4603,9 +4604,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vmovaps %xmm0, %xmm5
 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3
 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4650,9 +4651,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vmovaps %zmm0, %zmm5
 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
 %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
@@ -4721,9 +4722,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vmovapd %xmm0, %xmm5
 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3
 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
@@ -4821,12 +4822,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm0, %xmm4
 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vmovapd %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1
-; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4849,12 +4850,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa
 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm0, %xmm4
 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vmovaps %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1
-; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -4909,12 +4910,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm2, %xmm4
 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4937,12 +4938,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5069,12 +5070,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm2, %xmm4
 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5097,12 +5098,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo
 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5125,12 +5126,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm2, %xmm4
 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5153,12 +5154,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 4ef88ac495c3..96aefdb10584 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT: Lcfi0:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2d %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-NEXT: Lcfi1:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; CHECK-NEXT: korb %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korb %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2w %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-NEXT: Lcfi2:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT: Lcfi3:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; CHECK-NEXT: kord %k1, %k0, %k0
 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
-; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
-; CHECK-NEXT: kord %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %ymm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
 define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-LABEL: test_64i1:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: Lcfi4:
-; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
-; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
 ; CHECK-NEXT: vpmovm2b %k0, %zmm0
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
 %cmp_res = icmp ugt <64 x i8> %a, %b
diff --git a/test/CodeGen/X86/avx512-scalar_mask.ll b/test/CodeGen/X86/avx512-scalar_mask.ll
new file mode 100644
index 000000000000..47c6813fa8dc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-scalar_mask.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should just return %xmm0 here.
+define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should zero the lower element of xmm0 and return it.
+define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should just return %xmm0 here.
+define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should zero the lower element of xmm0 and return it.
+define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+ ret < 4 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll
new file mode 100644
index 000000000000..1940864824ff
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vselect.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mcpu=knl | FileCheck %s --check-prefixes=CHECK,CHECK-KNL
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+entry:
+ %m.trunc = trunc <8 x i64> %m to <8 x i1>
+ %ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b
+ ret <8 x i64> %ret
+}
+
+; This is a very contrived test case to trick the legalizer into splitting the
+; v16i1 masks in the select during type legalization, and in so doing extend them
+; into two v8i64 types. This lets us ensure that the lowering code can handle
+; both formulations of vselect. All of this trickery is because we can't
+; directly form an SDAG input to the lowering.
+define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
+; CHECK-SKX-LABEL: test2:
+; CHECK-SKX: # BB#0: # %entry
+; CHECK-SKX-NEXT: vxorps %zmm6, %zmm6, %zmm6
+; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-SKX-NEXT: korw %k1, %k0, %k0
+; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
+; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
+; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
+; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-KNL-LABEL: test2:
+; CHECK-KNL: # BB#0: # %entry
+; CHECK-KNL-NEXT: vpxord %zmm6, %zmm6, %zmm6
+; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-KNL-NEXT: korw %k1, %k0, %k1
+; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-KNL-NEXT: retq
+entry:
+ %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
+ %lt.m = fcmp olt <16 x float> %y, zeroinitializer
+ %m.or = or <16 x i1> %gt.m, %lt.m
+ %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b
+ ret <16 x double> %ret
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 9b4e73a18fc2..faa055dfbbf3 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
@@ -806,9 +806,9 @@
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
 ; AVX512BW-NEXT: kmovd %esi, %k1
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -836,8 +836,8 @@
 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 3337f42eb142..13b850ccc3b6 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -2169,9 +2169,9 @@
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
@@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
@@ -2421,9 +2421,9 @@
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 7df07b0413ed..571f345d4616 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
@@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
@@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
@@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
@@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
 ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
@@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16>
 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
@@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
@@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index 8f528394f5bd..f8f47c87100a 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32>
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 37aea45e6107..96254f7c95b0 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index c5478dad4224..1377733739fe 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -40,8 +40,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 000390404b54..97ac0fde10ec 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -414,8 +414,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0,
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
@@ -434,8 +434,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 52a84deebf51..595b3e0ebb86 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
@@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
 %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
@@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index ad9ea93c2031..1bfdfd0e634d 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
 ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
 ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
@@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index ca130bd2b676..b8531e25bfa1 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -118,78 +118,78 @@ define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
 }
 declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
+define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_ss_maskz:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 %mask, i32 8) ;
 ret <4 x float> %res
 }
-define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
+define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_ss_mask:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask, i32 8) ;
 ret <4 x float> %res
 }
-define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
+define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 8) ;
 ret <2 x double> %res
 }
-define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) {
+define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_mask:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask, i32 8) ;
 ret <2 x double> %res
 }
 declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) {
+define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
 ; CHECK-NEXT: retq # encoding: [0xc3]
 %mem = load double , double * %ptr, align 8
 %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
 ret <2 x double> %res
 }
-define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) {
+define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
 ; CHECK-NEXT: retq # encoding: [0xc3]
 %ptr1 = getelementptr double, double* %ptr, i32 18
 %mem = load double , double * %ptr1, align 8
 %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
 ret <2 x double> %res
 }
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 30ecc0d2e49e..9659dc6d455a 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
@@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
@@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll index 3ca686cef3bf..b2fe6eba88ab 100644 --- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll @@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52luq %ymm2, 
%ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 4d906a4fd29a..c2d8df6476b3 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) @@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO 
VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) @@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) @@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) @@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> 
%x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> 
%x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) @@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) @@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] +; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) @@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> 
@llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: 
[0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) @@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) @@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; 
CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 1f324d679564..684b0468cf51 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprorq $3, 
%ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04] +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4) @@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04] ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05] +; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4) @@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8] ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04] +; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4) diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll index afeba4ef2d99..94e2ee7a0aa9 100644 --- a/test/CodeGen/X86/bmi.ll +++ b/test/CodeGen/X86/bmi.ll @@ -454,6 +454,30 @@ entry: ret i32 %and } +define i32 @bzhi32d(i32 %a, i32 %b) { +; CHECK-LABEL: bzhi32d: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %sub = sub i32 32, %b + %shr = lshr i32 -1, %sub + %and = and i32 %shr, %a + ret i32 %and +} + +define i32 @bzhi32e(i32 %a, i32 %b) { +; CHECK-LABEL: bzhi32e: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %sub = sub i32 32, %b + %shl = shl i32 %a, %sub + %shr = lshr i32 %shl, %sub + ret i32 %shr +} + define i64 @bzhi64b(i64 %x, i8 zeroext %index) { ; CHECK-LABEL: bzhi64b: ; CHECK: # BB#0: # %entry @@ -468,6 +492,58 @@ entry: ret i64 %and } +define i64 @bzhi64c(i64 %a, i64 %b) { +; CHECK-LABEL: bzhi64c: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i64 64, %b + %shr = lshr i64 -1, %sub + %and = and i64 %shr, %a + ret i64 %and +} + +define i64 @bzhi64d(i64 %a, i32 %b) { +; CHECK-LABEL: bzhi64d: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i32 64, %b + %sh_prom = zext i32 %sub to i64 + %shr = lshr i64 -1, %sh_prom + %and = and i64 %shr, %a + ret i64 %and +} + +define i64 @bzhi64e(i64 %a, i64 %b) { +; CHECK-LABEL: bzhi64e: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i64 64, %b + %shl = shl i64 %a, %sub + %shr = lshr i64 %shl, %sub + ret i64 %shr +} + +define i64 @bzhi64f(i64 %a, i32 %b) { +; CHECK-LABEL: bzhi64f: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i32 64, %b + %sh_prom = zext i32 %sub to i64 + %shl = shl i64 %a, %sh_prom + %shr = lshr i64 %shl, %sh_prom + ret i64 %shr +} + define i64 @bzhi64_constant_mask(i64 %x) { ; CHECK-LABEL: bzhi64_constant_mask: ; CHECK: # BB#0: # %entry diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll index a9c74df9d0d9..1340b7662a7a 100644 --- a/test/CodeGen/X86/bswap_tree2.ll +++ b/test/CodeGen/X86/bswap_tree2.ll @@ -9,31 +9,32 @@ define i32 @test1(i32 %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax ; 
CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # BB#0: -; CHECK64-NEXT: movl %edi, %ecx -; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx ; CHECK64-NEXT: bswapl %edi ; CHECK64-NEXT: shrl $16, %edi -; CHECK64-NEXT: orl %ecx, %eax -; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: orl %ecx, %edi +; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll index 1e44aec99fc5..83ab2fac2f16 100644 --- a/test/CodeGen/X86/cast-vsel.ll +++ b/test/CodeGen/X86/cast-vsel.ll @@ -200,32 +200,29 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; SSE41: # BB#0: ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm1, %xmm5 -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: pshufb %xmm1, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pandn %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm5 +; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc: ; AVX1: # BB#0: ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -233,13 +230,11 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,2,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cmp = icmp eq <8 x i16> %a, %b diff --git a/test/CodeGen/X86/combine-abs.ll b/test/CodeGen/X86/combine-abs.ll index 887abe99f6ed..37beb438d737 100644 --- a/test/CodeGen/X86/combine-abs.ll +++ b/test/CodeGen/X86/combine-abs.ll @@ -50,12 +50,11 @@ define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) { define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) { ; AVX2-LABEL: combine_v4i64_abs_abs: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index 3ad38f2717d9..3dbff2680c22 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -11,8 +11,7 @@ define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_zero: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> zeroinitializer, %x ret <4 x i32> %1 diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 706e89051a3d..21564cdd7353 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -6,30 +6,12 @@ define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) { ; SSE-LABEL: combine_vec_lshr_zero: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: psrld %xmm2, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psrld %xmm0, %xmm3 -; SSE-NEXT: psrld %xmm2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_zero: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsrlvd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> zeroinitializer, %x ret <4 x i32> %1 diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll index 7b82125dc372..2f3c343afac0 100644 --- a/test/CodeGen/X86/constructor.ll +++ b/test/CodeGen/X86/constructor.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple x86_64-unknown-freebsd < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple 
x86_64-unknown-nacl < %s | FileCheck --check-prefix=NACL %s +; RUN: llc -mtriple i586-intel-elfiamcu -use-ctors < %s | FileCheck %s --check-prefix=MCU-CTORS +; RUN: llc -mtriple i586-intel-elfiamcu < %s | FileCheck %s --check-prefix=MCU-INIT-ARRAY @llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }] @v = weak_odr global i8 0 @@ -37,3 +39,6 @@ entry: ; NACL-NEXT: .section .init_array,"aw",@init_array ; NACL-NEXT: .p2align 2 ; NACL-NEXT: .long f + +; MCU-CTORS: .section .ctors,"aw",@progbits +; MCU-INIT-ARRAY: .section .init_array,"aw",@init_array diff --git a/test/CodeGen/X86/dbg-baseptr.ll b/test/CodeGen/X86/dbg-baseptr.ll index fb0da1b50d11..893ca93a9944 100644 --- a/test/CodeGen/X86/dbg-baseptr.ll +++ b/test/CodeGen/X86/dbg-baseptr.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s | FileCheck %s +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF ; This test checks that parameters on the stack pointer are correctly ; referenced by debug info. target triple = "x86_64--" @@ -7,24 +8,54 @@ target triple = "x86_64--" @ptr = external global i32* %struct.s = type { i32, i32, i32, i32, i32 } +; Simple case: no FP, use offset from RSP. + ; CHECK-LABEL: f0: -; CHECK: DEBUG_VALUE: f:input <- [%RSP+8] +; CHECK-NOT: pushq +; CHECK: movl $42, %eax +; CHECK: retq define i32 @f0(%struct.s* byval align 8 %input) !dbg !8 { call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !4, metadata !17), !dbg !18 - ret i32 42 + ret i32 42, !dbg !18 } +; DWARF-LABEL: .debug_info contents: + +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 57 ) +; 0x57 -> RSP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f0") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 08 ) +; DW_OP_fbreg (0x91) 0x08 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + + +; Dynamic alloca forces the use of RBP as the base pointer + ; CHECK-LABEL: f1: -; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] +; CHECK: pushq %rbp +; CHECK: movl $42, %eax +; CHECK: popq %rbp +; CHECK: retq define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 { %val = load i64, i64* @glob ; this alloca should force FP usage. %stackspace = alloca i32, i64 %val, align 1 store i32* %stackspace, i32** @ptr call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !20, metadata !17), !dbg !21 - ret i32 42 + ret i32 42, !dbg !21 } +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 ) +; 0x56 -> RBP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f1") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 ) +; DW_OP_fbreg (0x91) 0x10 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + ; CHECK-LABEL: f2: ; Just check that we are indeed aligning the stack and setting up a base pointer ; in RBX. @@ -34,17 +65,24 @@ define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 { ; CHECK: andq $-64, %rsp ; CHECK: subq $64, %rsp ; CHECK: movq %rsp, %rbx -; The parameter should still be referenced through RBP though. 
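The DWARF comments in the checks above decode the location expressions by hand: 0x91 is DW_OP_fbreg, whose operand is a SLEB128-encoded offset from the frame base (RSP for f0, RBP once the dynamic alloca forces a frame pointer). A minimal C sketch of that decoding, assuming nothing beyond the standard SLEB128 encoding (the helper name is mine):

#include <stdint.h>
#include <stdio.h>

/* Decode a SLEB128 value, as used for DW_OP_fbreg's operand. */
static int64_t read_sleb128(const uint8_t *p, unsigned *len) {
    int64_t result = 0;
    unsigned shift = 0, i = 0;
    uint8_t byte;
    do {
        byte = p[i++];
        result |= (int64_t)(byte & 0x7f) << shift;
        shift += 7;
    } while (byte & 0x80);
    if (shift < 64 && (byte & 0x40))
        result |= -((int64_t)1 << shift); /* sign-extend the last byte */
    *len = i;
    return result;
}

int main(void) {
    const uint8_t f0_loc[] = {0x91, 0x08}; /* DW_OP_fbreg +8  -> [%RSP+8]  */
    const uint8_t f1_loc[] = {0x91, 0x10}; /* DW_OP_fbreg +16 -> [%RBP+16] */
    unsigned n;
    printf("fbreg %+lld\n", (long long)read_sleb128(f0_loc + 1, &n));
    printf("fbreg %+lld\n", (long long)read_sleb128(f1_loc + 1, &n));
    return 0;
}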
-; CHECK-NOT: DEBUG_VALUE: f:input <- [%RBX -; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] define i32 @f2(%struct.s* byval align 8 %input) !dbg !22 { %val = load i64, i64* @glob %stackspace = alloca i32, i64 %val, align 64 store i32* %stackspace, i32** @ptr call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !23, metadata !17), !dbg !24 - ret i32 42 + ret i32 42, !dbg !24 } +; "input" should still be referred to through RBP. +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 ) +; 0x56 -> RBP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f2") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 ) +; DW_OP_fbreg (0x91) 0x10 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!2} @@ -52,7 +90,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !0 = !{i32 2, !"Dwarf Version", i32 4} !1 = !{i32 2, !"Debug Info Version", i32 3} -!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, emissionKind: FullDebug) !3 = !DIFile(filename: "dbg-baseptr.ll", directory: "/") !4 = !DILocalVariable(name: "input", arg: 1, scope: !8, file: !3, line: 5, type: !9) !5 = !{} @@ -60,7 +98,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !6 = !DISubroutineType(types: !7) !7 = !{!10, !9} -!8 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!8 = distinct !DISubprogram(name: "f0", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, unit: !2, variables: !5) !9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", elements: !11) !10 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) @@ -74,9 +112,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !17 = !DIExpression() !18 = !DILocation(line: 5, scope: !8) -!19 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!19 = distinct !DISubprogram(name: "f1", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) !20 = !DILocalVariable(name: "input", arg: 1, scope: !19, file: !3, line: 5, type: !9) !21 = !DILocation(line: 5, scope: !19) -!22 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!22 = distinct !DISubprogram(name: "f2", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) !23 = !DILocalVariable(name: "input", arg: 1, scope: !22, file: !3, line: 5, type: !9) !24 = !DILocation(line: 5, scope: !22) diff --git a/test/CodeGen/X86/elf-associated.ll b/test/CodeGen/X86/elf-associated.ll index 361cf66cce72..7d58c3437025 100644 --- a/test/CodeGen/X86/elf-associated.ll +++ b/test/CodeGen/X86/elf-associated.ll @@ -37,3 +37,8 @@ @l = global i32 1, section "ccc", !associated !5 !5 = !{i32* null} ; CHECK-DAG: .section ccc,"aw",@progbits + +; Null metadata. 
+@m = global i32 1, section "ddd", !associated !6 +!6 = distinct !{null} +; CHECK-DAG: .section ddd,"aw",@progbits diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index d68236e9d250..eb06eb75a4d7 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: imull {{.*#+}} 4-byte Folded Reload -; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: retl %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll index 98082ec611d4..6c6bc8bdc1d1 100644 --- a/test/CodeGen/X86/fp128-i128.ll +++ b/test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll index 4596b83f7bc2..b5507523a75a 100644 --- a/test/CodeGen/X86/haddsub-2.ll +++ b/test/CodeGen/X86/haddsub-2.ll @@ -933,14 +933,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] -; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] ; AVX-NEXT: retq %vecext = extractelement <4 x float> %A, i32 2 %vecext1 = extractelement <4 x float> %A, i32 3 diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll index cea9ac26edbc..ec620b8ce877 100644 --- a/test/CodeGen/X86/known-signbits-vector.ll +++ b/test/CodeGen/X86/known-signbits-vector.ll @@ -137,3 +137,64 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin %6 = sitofp i64 %5 to float ret float %6 } + +define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind { +; X32-LABEL: signbits_sext_shuffle_sitofp: +; X32: # BB#0: +; X32-NEXT: vpmovsxdq %xmm0, %xmm1 +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovsxdq %xmm0, %xmm0 +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X32-NEXT: vextractf128 $1, 
%ymm0, %xmm1 +; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-NEXT: vcvtdq2pd %xmm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: signbits_sext_shuffle_sitofp: +; X64: # BB#0: +; X64-NEXT: vpmovsxdq %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovsxdq %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-NEXT: vcvtdq2pd %xmm0, %ymm0 +; X64-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = shufflevector <4 x i64> %1, <4 x i64>%a1, <4 x i32> + %3 = sitofp <4 x i64> %2 to <4 x double> + ret <4 x double> %3 +} + +define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { +; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X32: # BB#0: +; X32-NEXT: vpsrad $16, %xmm0, %xmm1 +; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X64: # BB#0: +; X64-NEXT: vpsrad $16, %xmm0, %xmm1 +; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X64-NEXT: retq + %1 = ashr <2 x i64> %a0, + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> + %3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> + %4 = ashr <4 x i64> %3, + %5 = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> + %6 = sitofp <2 x i64> %5 to <2 x double> + ret <2 x double> %6 +} diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir new file mode 100644 index 000000000000..70aac21c7ff2 --- /dev/null +++ b/test/CodeGen/X86/leaFixup32.mir @@ -0,0 +1,508 @@ +# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s +--- | + ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll' + source_filename = "test/CodeGen/X86/fixup-lea.ll" + target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + target triple = "i386" + ;generated using: llc -stop-after x86-pad-short-functions fixup-lea.ll > leaFinxup32.mir + + ;test2add_32: 3 operands LEA32r that can be replaced with 2 add instructions + ; where ADD32ri8 is chosen + define i32 @test2add_32() { + ret i32 0 + } + + ;test2add_ebp_32: 3 operands LEA32r that can be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register + define i32 @test2add_ebp_32() { + ret i32 0 + } + + ;test1add_ebp_32: 2 operands LEA32r where base register is ebp and can be replaced + ; with an add instruction + define i32 @test1add_ebp_32() { + ret i32 0 + } + + ;testleaadd_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_32() { + ret i32 0 + } + + ;testleaadd_ebp_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base is ebp register + define i32 @testleaadd_ebp_32() { + ret i32 0 + } + + ;test1lea_ebp_32: 2 operands LEA32r wher base register is rbp/r13/ebp and can be replaced + ; with a lea instruction + define i32 @test1lea_ebp_32() { + ret i32 0 + } + + ;test2addi32_32: 3 operands LEA32r that can be 
replaced with 2 add instructions where ADD32ri32 + ; is chosen + define i32 @test2addi32_32() { + ret i32 0 + } + + ;test1mov1add_ebp_32: 2 operands LEA32r that can be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_ebp_32() { + ret i32 0 + } + + ;testleaadd_ebp_index_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_ebp_index_32() { + ret i32 0 + } + + ;testleaadd_ebp_index2_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_ebp_index2_32() { + ret i32 0 + } + + ;test_skip_opt_32: 3 operands LEA32r that can not be replaced with 2 instructions + define i32 @test_skip_opt_32() { + ret i32 0 + } + + ;test_skip_eflags_32: LEA32r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_32() { + ret i32 0 + } + +... +--- +name: test2add_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %eax = ADD32rr %eax, killed %ebp + ; CHECK: %eax = ADD32ri8 %eax, -5 + + %eax = LEA32r killed %eax, 1, killed %ebp, -5, _ + RETQ %eax + +... +--- +name: test2add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebp = ADD32rr %ebp, killed %eax + ; CHECK: %ebp = ADD32ri8 %ebp, -5 + + %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _ + RETQ %ebp + +... +--- +name: test1add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebp = ADD32rr %ebp, killed %eax + + %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _ + RETQ %ebp + +... 
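+# A sketch (illustrative only, AT&T syntax assumed; not part of the checked
+# output) of the rewrite exercised here: a three-operand LEA whose
+# destination equals its base can be split into two adds, provided EFLAGS
+# is dead at that point, since ADD clobbers the flags:
+#
+#   leal -5(%eax,%ebp), %eax   # eax = eax + ebp - 5
+#   -->
+#   addl %ebp, %eax            # eax += ebp   (ADD32rr)
+#   addl $-5, %eax             # eax += -5    (ADD32ri8)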
+--- +name: testleaadd_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %esi + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0 + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _ + + %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _ + RETQ %ebx + +... +--- +name: test2addi32_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %eax = ADD32rr %eax, killed %ebp + ; CHECK: %eax = ADD32ri %eax, 129 + + %eax = LEA32r killed %eax, 1, killed %ebp, 129, _ + RETQ %eax + +... 
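+# The add-immediate opcode tracks displacement width: displacements in
+# [-128, 127] use the sign-extended 8-bit form (ADD32ri8, as with -5 above),
+# while 129 needs the full 32-bit immediate (ADD32ri). Illustrative sketch,
+# AT&T syntax:
+#
+#   leal 129(%eax,%ebp), %eax  -->  addl %ebp, %eax
+#                                   addl $129, %eax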
+--- +name: test1mov1add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = MOV32rr killed %ebp + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 1, killed %ebp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_index_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r _, 1, killed %ebp, 5, _ + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 1, killed %ebp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_index2_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r _, 4, killed %ebp, 5, _ + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 4, killed %ebp, 5, _ + RETQ %ebx + +... +--- +name: test_skip_opt_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _ + + %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _ + RETQ %ebp + +... 
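+# The next test covers the other bail-out: when EFLAGS is live across the
+# LEA, replacing it with ADD would clobber the flags, so the LEA must stay.
+# (test_skip_opt_32 above bails for a different reason: with dst == base ==
+# index and scale > 1 there is no two-instruction split that avoids a
+# scratch register.) Illustrative sketch, AT&T syntax:
+#
+#   cmpl %ebx, %eax              # sets EFLAGS
+#   leal 5(%eax,%eax,4), %ebx    # kept as LEA; addl would clobber EFLAGS
+#   je   .LBB0_1                 # still consumes the cmp result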
+--- +name: test_skip_eflags_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebp' } + - { reg: '%eax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _ + ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _ + ; CHECK: %ebp = ADD32ri8 %ebp, 5 + + CMP32rr %eax, killed %ebx, implicit-def %eflags + %ebx = LEA32r killed %eax, 4, killed %eax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %eax, %ebp, %ebx + %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _ + RETQ %ebp + +... + + + diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir new file mode 100644 index 000000000000..9b0058750598 --- /dev/null +++ b/test/CodeGen/X86/leaFixup64.mir @@ -0,0 +1,1041 @@ +# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s +--- | + ; ModuleID = 'lea-2.ll' + source_filename = "lea-2.ll" + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + ;generated using: llc -stop-after x86-pad-short-functions lea-2.ll > leaFinxup64.mir + + ;testleaadd_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions + ; but can be replaced with 1 lea + 1 add + define i32 @testleaadd_64_32_1() { + ret i32 0 + } + + ;testleaadd_rbp_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register but it can be replaced with 1 lea + 1 add + define i32 @testleaadd_rbp_64_32_1() { + ret i32 0 + } + + ;test1lea_rbp_64_32_1: 2 operands LEA64_32r where base register is rbp/r13/ebp and can not + ; be replaced with an add instruction but can be replaced with 1 lea instruction + define i32 @test1lea_rbp_64_32_1() { + ret i32 0 + } + + ;test2add_64: 3 operands LEA64r that can be replaced with 2 add instructions + define i32 @test2add_64() { + ret i32 0 + } + + ;test2add_rbp_64: 3 operands LEA64r that can be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register + define i32 @test2add_rbp_64() { + ret i32 0 + } + + ;test1add_rbp_64: 2 operands LEA64r where base register is rbp/r13/ebp and can be replaced + ; with an add instruction + define i32 @test1add_rbp_64() { + ret i32 0 + } + + ;testleaadd_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions + ; where the base is rbp/r13/ebp register + define i32 @testleaadd_rbp_64_32() { + ret i32 0 + } + + ;test1lea_rbp_64_32: 2 operands LEA64_32r where base register is rbp/r13/ebp and can be replaced + ; with a lea instruction + define i32 @test1lea_rbp_64_32() { + ret i32 0 + } + + ;testleaadd_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_64() { + ret i32 0 + } + + ;testleaadd_rbp_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base is rbp/r13/ebp register + define i32 @testleaadd_rbp_64() { + ret i32 0 + } + + ;test1lea_rbp_64: 2 operands LEA64r wher base register is rbp/r13/ebp and can be 
replaced + ; with a lea instruction + define i32 @test1lea_rbp_64() { + ret i32 0 + } + + ;test8: dst = base & scale!=1, can't optimize + define i32 @test8() { + ret i32 0 + } + + ;testleaaddi32_64_32: 3 operands LEA64_32r that can be replaced with 1 lea + 1 add instructions where + ; ADD64ri32 is chosen + define i32 @testleaaddi32_64_32() { + ret i32 0 + } + + ;test1mov1add_rbp_64_32: 2 operands LEA64_32r cannot be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_rbp_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_index_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_rbp_index_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_index2_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_rbp_index2_64_32() { + ret i32 0 + } + + ;test2addi32_64: 3 operands LEA64r that can be replaced with 2 add instructions where ADD64ri32 + ; is chosen + define i32 @test2addi32_64() { + ret i32 0 + } + + ;test1mov1add_rbp_64: 2 operands LEA64r that can be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_rbp_64() { + ret i32 0 + } + + ;testleaadd_rbp_index_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_rbp_index_64() { + ret i32 0 + } + + ;testleaadd_rbp_index2_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_rbp_index2_64() { + ret i32 0 + } + + ;test_skip_opt_64: 3 operands LEA64r that can not be replaced with 2 instructions + define i32 @test_skip_opt_64() { + ret i32 0 + } + + ;test_skip_eflags_64: LEA64r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_64() { + ret i32 0 + } + + ;test_skip_opt_64_32: 3 operands LEA64_32r that can not be replaced with 2 instructions + define i32 @test_skip_opt_64_32() { + ret i32 0 + } + + ;test_skip_eflags_64_32: LEA64_32r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_64_32() { + ret i32 0 + } + + +... +--- +name: testleaadd_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %eax = ADD32ri8 %eax, -5 + + %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _ + RETQ %eax + +... 
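+# LEA64_32r takes 64-bit base/index registers but writes a 32-bit result,
+# so its sources cannot feed 32-bit ADDs directly; the pass instead keeps
+# one displacement-free LEA and folds the offset into an add afterwards.
+# Illustrative sketch, AT&T syntax:
+#
+#   leal -5(%rax,%rbp), %eax
+#   -->
+#   leal (%rax,%rbp), %eax     # 32-bit result of the 64-bit address math
+#   addl $-5, %eax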
+--- +name: testleaadd_rbp_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %ebp = ADD32ri8 %ebp, -5 + + %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebp + +... +--- +name: test1lea_rbp_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0 + + %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebp + +... +--- +name: test2add_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rax = ADD64rr %rax, killed %rbp + ; CHECK: %rax = ADD64ri8 %rax, -5 + + %rax = LEA64r killed %rax, 1, killed %rbp, -5, _ + RETQ %eax + +... +--- +name: test2add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbp = ADD64rr %rbp, killed %rax + ; CHECK: %rbp = ADD64ri8 %rbp, -5 + + %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebp + +... 
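+# rbp/r13 (ebp in 32-bit mode) are called out separately most likely for an
+# encoding reason: SIB addressing with one of them as the *base* cannot be
+# encoded without at least a disp8, so the displacement-free LEA emitted by
+# the fixup puts such a register in the index slot instead. Sketch, AT&T
+# syntax:
+#
+#   leal -5(%rbp,%rax), %ebp  -->  leal (%rax,%rbp), %ebp   # rbp as index
+#                                  addl $-5, %ebp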
+--- +name: test1add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbp = ADD64rr %rbp, killed %rax + + %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebp + +... +--- +name: testleaadd_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebx + +... 
+--- +name: testleaadd_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %rbx = ADD64ri8 %rbx, -5 + + %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %rbx = ADD64ri8 %rbx, -5 + + %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + + %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebx + +... +--- +name: test8 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rdi' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rdi, %rbp + ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _ + ; CHECK: %r12 = ADD64rr %r12, killed %rbp + %rbp = KILL %rbp, implicit-def %rbp + %r13 = KILL %rdi, implicit-def %r13 + %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _ + RETQ %r12 + +... 
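+# In test8 the scale is 2, so no plain ADD can form 2*r13; the scaled index
+# and displacement stay in a base-less LEA and the base register is added
+# afterwards. Illustrative sketch, AT&T syntax:
+#
+#   leaq 5(%rbp,%r13,2), %r12
+#   -->
+#   leaq 5(,%r13,2), %r12      # scaled index + displacement, no base
+#   addq %rbp, %r12            # then add the base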
+--- +name: testleaaddi32_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %eax = ADD32ri %eax, 129 + + %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _ + RETQ %eax + +... +--- +name: test1mov1add_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index2_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _ + + %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _ + RETQ %ebx + +... 
+--- +name: test2addi32_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rax = ADD64rr %rax, killed %rbp + ; CHECK: %rax = ADD64ri32 %rax, 129 + + %rax = LEA64r killed %rax, 1, killed %rbp, 129, _ + RETQ %eax + +... +--- +name: test1mov1add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = MOV64rr killed %rbp + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 1, killed %rbp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r _, 1, killed %rbp, 5, _ + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 1, killed %rbp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index2_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r _, 4, killed %rbp, 5, _ + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 4, killed %rbp, 5, _ + RETQ %ebx + +... 
+--- +name: test_skip_opt_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _ + + %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _ + RETQ %ebp + +... +--- +name: test_skip_eflags_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbp' } + - { reg: '%rax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _ + ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _ + ; CHECK: %rbp = ADD64ri8 %rbp, 5 + + CMP64rr %rax, killed %rbx, implicit-def %eflags + %rbx = LEA64r killed %rax, 4, killed %rax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %rax, %rbp, %rbx + %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _ + RETQ %ebp + +... +--- +name: test_skip_opt_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _ + + %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _ + RETQ %ebp + +... +--- +name: test_skip_eflags_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbp' } + - { reg: '%rax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _ + ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _ + ; CHECK: %ebp = ADD32ri8 %ebp, 5 + + CMP64rr %rax, killed %rbx, implicit-def %eflags + %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %rax, %rbp, %rbx + %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _ + RETQ %ebp + +... 
+ + + diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll new file mode 100644 index 000000000000..a9cf086dbd90 --- /dev/null +++ b/test/CodeGen/X86/lrshrink.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call +; to minimize live-range. + +define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) { +entry: + br i1 %a, label %then, label %else + +then: + br label %else + +else: + %0 = phi i64 [ 4, %entry ], [ 10, %then ] + %r = phi i64 [ %r1, %entry ], [ %r2, %then ] + %s = phi i64 [ %s1, %entry ], [ %s2, %then ] + %t = phi i64 [ %t1, %entry ], [ %t2, %then ] +; CHECK-LABEL: test: +; CHECK: add +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add + %1 = tail call i32 @_Z3foov() + %2 = zext i32 %1 to i64 + %3 = tail call i32 @_Z3foov() + %4 = zext i32 %3 to i64 + %5 = tail call i32 @_Z3foov() + %6 = zext i32 %5 to i64 + %7 = add nuw nsw i64 %0, %r + tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %8 = add nuw nsw i64 %2, %7 + %9 = add nuw nsw i64 %4, %8 + %10 = add nuw nsw i64 %6, %9 + %11 = add nuw nsw i64 %s, %t + tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %12 = add nuw nsw i64 %10, %11 + ret i64 %12 +} + +declare i32 @_Z3foov() +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!1, !2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug) +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DIFile(filename: "a.c", directory: "./") +!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0) +!5 = !DILocalVariable(name: "x", scope: !4) +!6 = !DILocation(line: 4, scope: !4) diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index d332b2f3169f..af86df510016 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: pmullw %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-16, %rax diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index 29a662fb217e..c5de8dd96cbc 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -3,7 +3,7 @@ ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32 -; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll index 71417694b0d4..2f7714e63886 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -270,9 +270,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: @@ -292,9 +292,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_012u: @@ -321,9 +321,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: @@ -343,9 +343,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_019u: diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll index e62a1d04dad6..94bbe75702cb 100644 --- a/test/CodeGen/X86/misched-matrix.ll +++ b/test/CodeGen/X86/misched-matrix.ll @@ -17,9 +17,9 @@ ; ; TOPDOWN-LABEL: %for.body ; TOPDOWN: movl %{{.*}}, ( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 4( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 8( ; TOPDOWN: movl %{{.*}}, 12( ; TOPDOWN-LABEL: %for.end diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll index dfce6c681500..83b2be83d552 100644 --- a/test/CodeGen/X86/not-and-simplify.ll +++ b/test/CodeGen/X86/not-and-simplify.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefix=ALL --check-prefix=NO_BMI ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=ALL --check-prefix=BMI @@ -11,13 +12,24 @@ define i32 @shrink_xor_constant1(i32 %x) { ; ALL-NEXT: xorl $1, %edi ; ALL-NEXT: movl %edi, %eax ; ALL-NEXT: retq -; %sh = lshr i32 %x, 31 %not = xor i32 %sh, -1 %and = and i32 %not, 1 ret i32 %and } +define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) { +; ALL-LABEL: shrink_xor_constant1_splat: +; ALL: # BB#0: +; ALL-NEXT: psrld $31, %xmm0 +; ALL-NEXT: pandn {{.*}}(%rip), %xmm0 +; ALL-NEXT: retq + %sh = lshr <4 x i32> %x, + %not = xor <4 x i32> %sh, + %and = and <4 x i32> %not, + ret <4 x i32> %and +} + ; Clear low bits via shift, set them with xor (not), then mask them off. 
define i8 @shrink_xor_constant2(i8 %x) { @@ -27,10 +39,22 @@ define i8 @shrink_xor_constant2(i8 %x) { ; ALL-NEXT: xorb $-32, %dil ; ALL-NEXT: movl %edi, %eax ; ALL-NEXT: retq -; %sh = shl i8 %x, 5 %not = xor i8 %sh, -1 %and = and i8 %not, 224 ; 0xE0 ret i8 %and } +define <16 x i8> @shrink_xor_constant2_splat(<16 x i8> %x) { +; ALL-LABEL: shrink_xor_constant2_splat: +; ALL: # BB#0: +; ALL-NEXT: psllw $5, %xmm0 +; ALL-NEXT: pand {{.*}}(%rip), %xmm0 +; ALL-NEXT: pandn {{.*}}(%rip), %xmm0 +; ALL-NEXT: retq + %sh = shl <16 x i8> %x, + %not = xor <16 x i8> %sh, + %and = and <16 x i8> %not, + ret <16 x i8> %and +} + diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index d26cf02dd942..0bda41a30c69 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE42: # BB#0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] +; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: por %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm2 -; SSE42-NEXT: movq %xmm2, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movq %xmm1, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; AVX: # BB#0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; 
AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, (%rdi) ; AVX-NEXT: retq %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll index 5cd649bb3902..24db6ba9ca2f 100644 --- a/test/CodeGen/X86/packss.ll +++ b/test/CodeGen/X86/packss.ll @@ -26,18 +26,17 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind { ; X64-AVX1-LABEL: trunc_ashr_v4i64: ; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: trunc_ashr_v4i64: ; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; X64-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 88cb7a6d5825..50a661fcca11 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; @@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; 
SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm8, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 @@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm4, %xmm1 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmuludq %xmm7, %xmm1 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: pmuludq %xmm5, %xmm0 -; SSE41-NEXT: pmuludq %xmm8, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_sext: ; SSE41: # BB#0: ; 
SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm8 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm7 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pmuldq %xmm6, %xmm4 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 -; SSE41-NEXT: pmuldq %xmm5, %xmm0 -; SSE41-NEXT: pmuldq %xmm7, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm2 -; SSE41-NEXT: pmuldq %xmm8, %xmm3 +; SSE41-NEXT: pmuldq %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; @@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: diff --git a/test/CodeGen/X86/pr28129.ll b/test/CodeGen/X86/pr28129.ll index a155f71f79c3..15bffffa207f 100644 --- a/test/CodeGen/X86/pr28129.ll +++ b/test/CodeGen/X86/pr28129.ll @@ -5,15 +5,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) { ; X86-LABEL: cmp4f64_domain: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp4f64_domain: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer @@ -26,15 +26,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) { define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize { ; X86-LABEL: cmp4f64_domain_optsize: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp4f64_domain_optsize: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer @@ -47,15 +47,15 @@ define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize { define <8 x float> @cmp8f32_domain(<8 x float> %a) { ; X86-LABEL: cmp8f32_domain: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; 
X64-LABEL: cmp8f32_domain: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer @@ -68,15 +68,15 @@ define <8 x float> @cmp8f32_domain(<8 x float> %a) { define <8 x float> @cmp8f32_domain_optsize(<8 x float> %a) optsize { ; X86-LABEL: cmp8f32_domain_optsize: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp8f32_domain_optsize: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll index 8c970b3d4771..94904018872b 100644 --- a/test/CodeGen/X86/pr29112.ll +++ b/test/CodeGen/X86/pr29112.ll @@ -38,7 +38,8 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10 ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3] @@ -52,10 +53,9 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vmovaps %xmm15, %xmm1 ; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm9 -; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0 ; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8 -; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) diff --git a/test/CodeGen/X86/pr30562.ll b/test/CodeGen/X86/pr30562.ll index dda736a1a183..a8e648074194 100644 --- a/test/CodeGen/X86/pr30562.ll +++ b/test/CodeGen/X86/pr30562.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + define i32 @foo(i64* nocapture %perm, i32 %n) { entry: br label %body diff --git a/test/CodeGen/X86/pr31088.ll b/test/CodeGen/X86/pr31088.ll index 0dd8eb0ece85..d7a546c7396d 100644 --- a/test/CodeGen/X86/pr31088.ll +++ b/test/CodeGen/X86/pr31088.ll @@ -150,12 +150,12 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; F16C-NEXT: vcvtph2ps %xmm3, %xmm3 ; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: 
retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll index e05fc926b080..143e3af82eb7 100644 --- a/test/CodeGen/X86/pr32284.ll +++ b/test/CodeGen/X86/pr32284.ll @@ -30,25 +30,24 @@ define void @foo() { ; X86-O0-NEXT: subl $12, %esp ; X86-O0-NEXT: .Lcfi0: ; X86-O0-NEXT: .cfi_def_cfa_offset 16 -; X86-O0-NEXT: movzbl c, %eax -; X86-O0-NEXT: testl %eax, %eax -; X86-O0-NEXT: setne %cl -; X86-O0-NEXT: movl %eax, %edx -; X86-O0-NEXT: movb %dl, %ch -; X86-O0-NEXT: testb %ch, %ch +; X86-O0-NEXT: movb c, %al +; X86-O0-NEXT: testb %al, %al ; X86-O0-NEXT: setne {{[0-9]+}}(%esp) -; X86-O0-NEXT: movzbl %cl, %edx -; X86-O0-NEXT: subl %eax, %edx -; X86-O0-NEXT: setle %cl -; X86-O0-NEXT: # implicit-def: %EAX -; X86-O0-NEXT: movb %cl, %al -; X86-O0-NEXT: andl $1, %eax -; X86-O0-NEXT: kmovd %eax, %k0 -; X86-O0-NEXT: kmovd %k0, %eax +; X86-O0-NEXT: movzbl c, %ecx +; X86-O0-NEXT: testl %ecx, %ecx +; X86-O0-NEXT: setne %al +; X86-O0-NEXT: movzbl %al, %edx +; X86-O0-NEXT: subl %ecx, %edx +; X86-O0-NEXT: setle %al +; X86-O0-NEXT: # implicit-def: %ECX ; X86-O0-NEXT: movb %al, %cl -; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %eax -; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-O0-NEXT: andl $1, %ecx +; X86-O0-NEXT: kmovd %ecx, %k0 +; X86-O0-NEXT: kmovd %k0, %ecx +; X86-O0-NEXT: movb %cl, %al +; X86-O0-NEXT: andb $1, %al +; X86-O0-NEXT: movzbl %al, %ecx +; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-O0-NEXT: addl $12, %esp ; X86-O0-NEXT: retl @@ -69,27 +68,25 @@ define void @foo() { ; ; X64-O0-LABEL: foo: ; X64-O0: # BB#0: # %entry -; X64-O0-NEXT: movzbl {{.*}}(%rip), %eax -; X64-O0-NEXT: movl %eax, %ecx -; X64-O0-NEXT: movb %cl, %dl -; X64-O0-NEXT: movl %ecx, %eax -; X64-O0-NEXT: testq %rcx, %rcx -; X64-O0-NEXT: setne %sil -; X64-O0-NEXT: testb %dl, %dl +; X64-O0-NEXT: movb {{.*}}(%rip), %al +; X64-O0-NEXT: testb %al, %al ; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movzbl %sil, %edi -; X64-O0-NEXT: subl %eax, %edi -; X64-O0-NEXT: setle %dl -; X64-O0-NEXT: # implicit-def: %EAX -; X64-O0-NEXT: movb %dl, %al -; X64-O0-NEXT: andl $1, %eax -; X64-O0-NEXT: kmovd %eax, %k0 -; X64-O0-NEXT: kmovd %k0, %eax -; X64-O0-NEXT: movb %al, %dl -; X64-O0-NEXT: andb $1, %dl -; X64-O0-NEXT: movzbl %dl, %eax -; X64-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx +; X64-O0-NEXT: testl %ecx, %ecx +; X64-O0-NEXT: setne %al +; X64-O0-NEXT: movzbl %al, %edx +; X64-O0-NEXT: subl %ecx, %edx +; X64-O0-NEXT: setle %al +; X64-O0-NEXT: # implicit-def: %ECX +; X64-O0-NEXT: movb %al, %cl +; X64-O0-NEXT: andl $1, %ecx +; X64-O0-NEXT: kmovd %ecx, %k0 +; X64-O0-NEXT: kmovd %k0, %ecx +; X64-O0-NEXT: movb %cl, %al +; X64-O0-NEXT: andb $1, %al +; X64-O0-NEXT: movzbl %al, %ecx +; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill ; X64-O0-NEXT: retq entry: %a = alloca i8, align 1 diff --git a/test/CodeGen/X86/pr32907.ll b/test/CodeGen/X86/pr32907.ll index bc03fbe06843..8057b31c961c 100644 --- a/test/CodeGen/X86/pr32907.ll +++ b/test/CodeGen/X86/pr32907.ll @@ -5,41 +5,44 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) { -; SSE-LABEL: PR32907: -; SSE: # BB#0: # %entry -; SSE-NEXT: psubq %xmm1, %xmm0 -; SSE-NEXT: movdqa 
%xmm0, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psubq %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: PR32907: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubq %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: PR32907: +; SSE42: # BB#0: # %entry +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pcmpgtq %xmm0, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX2-LABEL: PR32907: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR32907: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpsubq %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: diff --git a/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll new file mode 100644 index 000000000000..9a5da33223ba --- /dev/null +++ b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll @@ -0,0 +1,37 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s -o /dev/null +; pr33001 - Check that llc doesn't crash when running with O0 option. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <4 x i32> @test_masked_load(<4 x i32>* %base, <4 x i1> %mask) { + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %base, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) + + +define void @test_masked_store(<4 x i32>* %base, <4 x i32> %value, <4 x i1> %mask) { + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %base, i32 4, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) + + +define <4 x i32> @llvm_masked_gather(<4 x i32*> %ptrs, <4 x i1> %mask) { + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) + + +define void @llvm_masked_scatter(<4 x i32*> %ptrs, <4 x i32> %value, <4 x i1> %mask) { + call void @llvm.masked.scatter.v4i32(<4 x i32> %value, <4 x i32*> %ptrs, i32 4, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) + diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll index 5d5150ad62d6..4be3a4c2391b 100644 --- a/test/CodeGen/X86/rotate.ll +++ b/test/CodeGen/X86/rotate.ll @@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB0_4: -; 32-NEXT: orl %esi, %eax ; 32-NEXT: orl %ebx, %edx +; 32-NEXT: orl %esi, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB1_4: -; 32-NEXT: orl %ebx, %eax ; 32-NEXT: orl %esi, %edx +; 32-NEXT: orl %ebx, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-LABEL: rotr1_64_mem: ; 32: # BB#0: ; 32-NEXT: pushl %esi -; 32-NEXT: movl 8(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: movl (%eax), %ecx ; 32-NEXT: movl 4(%eax), %edx ; 32-NEXT: movl %edx, %esi @@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-NEXT: movl %ecx, 4(%eax) ; 32-NEXT: movl %esi, (%eax) ; 32-NEXT: popl %esi - +; 32-NEXT: retl +; ; 64-LABEL: rotr1_64_mem: ; 64: # BB#0: ; 64-NEXT: rorq (%rdi) ; 64-NEXT: retq + %A = load i64, i64 *%Aptr %B = shl i64 %A, 63 %C = lshr i64 %A, 1 @@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { define void @rotr1_32_mem(i32* %Aptr) nounwind { ; 32-LABEL: rotr1_32_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorl (%eax) ; 32-NEXT: retl ; @@ -590,7 +592,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind { define void @rotr1_16_mem(i16* %Aptr) nounwind { ; 32-LABEL: rotr1_16_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorw (%eax) ; 32-NEXT: retl ; @@ -609,7 +611,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind { define void @rotr1_8_mem(i8* %Aptr) nounwind { ; 32-LABEL: rotr1_8_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorb (%eax) ; 32-NEXT: retl ; diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll index b8a8b8afd14f..6a565a5c76f0 100644 
--- a/test/CodeGen/X86/sad.ll +++ b/test/CodeGen/X86/sad.ll @@ -149,127 +149,131 @@ middle.block: define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm3, %xmm1 
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm7 ; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: psubd %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: psubd %xmm10, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] 
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: psubd %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; SSE2-NEXT: psubd %xmm4, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm2, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm8 -; SSE2-NEXT: pxor %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE2-NEXT: psubd %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; SSE2-NEXT: psubd %xmm9, %xmm8 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 ; SSE2-NEXT: paddd %xmm7, %xmm13 -; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; 
SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm15, %xmm3 -; SSE2-NEXT: paddd %xmm14, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm14, %xmm13 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -398,288 +402,284 @@ middle.block: define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: subq $184, %rsp -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: subq $200, %rsp +; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: pxor %xmm0, 
%xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 -; SSE2-NEXT: movdqa a+1024(%rax), %xmm4 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm11 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm6, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm8 -; SSE2-NEXT: movdqa %xmm13, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: psubd %xmm13, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: psubd %xmm10, %xmm12 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm11 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: movdqa b+1072(%rax), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps a+1040(%rax), %xmm0 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 +; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 +; 
SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm15, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm15, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm0, %xmm10 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: pxor %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; 
SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] ; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm0, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm8 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm12 +; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE2-NEXT: psubd %xmm13, %xmm2 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; 
SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm15, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm15 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd 
%xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm14, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm14 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm12, %xmm8 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm3, 
%xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm2 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm2, %xmm15 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm13 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm11, %xmm10 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: paddd %xmm13, %xmm1 +; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm14, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] -; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $184, %rsp +; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad_avx64i8: @@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 @@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 +; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 +; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 +; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 -; AVX2-NEXT: vpabsd %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 +; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpabsd %ymm9, %ymm8 +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm10, 
%ymm8 +; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpabsd %ymm11, %ymm8 ; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpabsd %ymm12, %ymm8 ; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm13, %ymm8 +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpabsd %ymm14, %ymm8 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 -; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpabsd %zmm5, %zmm5 -; AVX512F-NEXT: vpabsd %zmm6, %zmm6 -; AVX512F-NEXT: vpabsd %zmm7, %zmm7 -; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 -; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vpabsd %zmm5, %zmm4 +; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vpabsd %zmm6, %zmm4 +; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpabsd %zmm7, %zmm4 +; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm13, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqu (%rdx), %xmm5 -; SSE2-NEXT: movdqu 16(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: psubd %xmm5, %xmm0 -; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: psubd %xmm1, %xmm12 -; SSE2-NEXT: psubd %xmm8, %xmm6 -; SSE2-NEXT: psubd %xmm15, %xmm11 -; SSE2-NEXT: psubd %xmm14, %xmm10 -; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqu 16(%rdi), %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqu (%rdx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psubd %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: psubd %xmm3, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm13 ; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index ce42d0d643e8..1afef86a5f11 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: +; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm2 -; GENERIC-NEXT: movd %ecx, %xmm3 -; GENERIC-NEXT: movd %r8d, %xmm4 +; GENERIC-NEXT: movd %r9d, %xmm1 +; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; GENERIC-NEXT: movd %r8d, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: -; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; ATOM-NEXT: jmp LBB7_6 ; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm2 -; ATOM-NEXT: movd %ecx, %xmm3 -; ATOM-NEXT: movd %r8d, %xmm4 +; ATOM-NEXT: movd %r9d, %xmm1 +; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; ATOM-NEXT: movd %r8d, %xmm3 ; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: LBB7_6: -; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index 2996edaec3e0..332bf2887fb0 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll index c869dff9e642..6701c247e6fc 100644 --- a/test/CodeGen/X86/shrink_vmul_sse.ll +++ b/test/CodeGen/X86/shrink_vmul_sse.ll @@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi ; CHECK-NEXT: movzbl (%edx,%ecx), %edx ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx +; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movzbl (%eax,%ecx), %eax ; CHECK-NEXT: imull %edx, %eax -; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4) ; CHECK-NEXT: movl %eax, (%esi,%ecx,4) ; CHECK-NEXT: popl %esi diff --git a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll index d46082f20a45..cbd5c69b1772 100644 --- a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -5,9 +5,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { ; AVX2-LABEL: foo2: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,1] -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX2-NEXT: vmovapd %xmm1, (%rdi) +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX2-NEXT: vmovapd %xmm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %res1 = shufflevector<2 x double> %res, <2 x double> undef, <2 x i32> @@ -18,9 +17,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind { ; AVX2-LABEL: foo4: ; AVX2: # BB#0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,0,2,3] -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> %res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> @@ -32,10 +30,8 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind { ; AVX2-LABEL: foo8: ; AVX2: # BB#0: ; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = <2,0,u,u,5,1,3,7> -; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> @@ -46,7 +42,7 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind { define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 
= xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -66,7 +62,7 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind { define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask3: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -76,9 +72,10 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask4: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -89,9 +86,10 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask5: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 0b03dffe99b5..d99cfaf535de 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1537,9 +1537,9 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: retl ; @@ -1673,13 +1673,13 @@ define void @test_mm_setcsr(i32 %a0) nounwind { define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; X32-LABEL: test_mm_setr_ps: ; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: unpcklps 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_ps: diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll index dfc1aefd31a6..68ab3f9f3205 100644 --- a/test/CodeGen/X86/sse1.ll +++ b/test/CodeGen/X86/sse1.ll @@ -66,7 +66,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: jne .LBB1_8 ; X32-NEXT: .LBB1_7: ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: jmp .LBB1_9 +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: je .LBB1_10 +; X32-NEXT: jmp .LBB1_11 ; X32-NEXT: .LBB1_1: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) @@ -77,11 +80,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: je .LBB1_7 ; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: .LBB1_9: # %entry -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: jne .LBB1_11 -; X32-NEXT: # BB#10: +; X32-NEXT: .LBB1_10: ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: .LBB1_11: # %entry ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -103,7 +105,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: jne .LBB1_8 ; X64-NEXT: .LBB1_7: ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: jmp .LBB1_9 +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi +; X64-NEXT: je .LBB1_10 +; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: testl %edx, %edx @@ -114,11 +119,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: je .LBB1_7 ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 -; X64-NEXT: .LBB1_9: # %entry -; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi ; X64-NEXT: jne .LBB1_11 -; X64-NEXT: # BB#10: +; X64-NEXT: .LBB1_10: ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: .LBB1_11: # %entry ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll index 4d895ea264c5..aed5e0d1c32e 100644 --- a/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -412,14 +412,14 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] ; SSE-NEXT: subss %xmm4, %xmm3 -; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE-NEXT: addss %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: addss %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: addss %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -431,12 +431,12 @@ define <4 x float> 
@test16(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX-NEXT: retq %1 = extractelement <4 x float> %A, i32 0 %2 = extractelement <4 x float> %B, i32 0 diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 503b9416c8d3..4a0dc9c1eb17 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X32: ## BB#0: ## %entry ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: addss %xmm2, %xmm3 +; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X32-NEXT: retl ; @@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X64: ## BB#0: ## %entry ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: addss %xmm2, %xmm3 +; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X64-NEXT: retq entry: @@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: addps %xmm2, %xmm3 ; X32-NEXT: addps %xmm3, %xmm0 ; X32-NEXT: retl @@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: addps %xmm2, %xmm3 ; X64-NEXT: addps %xmm3, %xmm0 ; X64-NEXT: retq diff --git a/test/CodeGen/X86/stackmap-frame-setup.ll b/test/CodeGen/X86/stackmap-frame-setup.ll index b83a8d61f6a2..df5ed5431b8a 100644 --- a/test/CodeGen/X86/stackmap-frame-setup.ll +++ b/test/CodeGen/X86/stackmap-frame-setup.ll @@ -7,11 +7,11 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata -; ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def +; ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def ; ISEL-NEXT: STACKMAP ; ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def call void (i64, i32, ...) 
@llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) -; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def +; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def ; FAST-ISEL-NEXT: STACKMAP ; FAST-ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def ret void diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index a42b3c96c3ae..1eef67764ab9 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -4344,7 +4344,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX1-NEXT: jmp .LBB80_6 ; AVX1-NEXT: .LBB80_4: ; AVX1-NEXT: movq %rax, %rcx @@ -4352,22 +4352,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX1-NEXT: .LBB80_6: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX1-NEXT: jmp .LBB80_9 ; AVX1-NEXT: .LBB80_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_9: ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax @@ -4397,29 +4397,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB80_15: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_16 ; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX1-NEXT: jmp .LBB80_18 ; AVX1-NEXT: .LBB80_16: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB80_18: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_19 ; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX1-NEXT: jmp .LBB80_21 ; AVX1-NEXT: .LBB80_19: ; AVX1-NEXT: movq %rax, %rcx @@ -4427,25 +4427,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: .LBB80_21: 
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_22 ; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX1-NEXT: jmp .LBB80_24 ; AVX1-NEXT: .LBB80_22: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB80_24: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4471,7 +4471,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX2-NEXT: jmp .LBB80_6 ; AVX2-NEXT: .LBB80_4: ; AVX2-NEXT: movq %rax, %rcx @@ -4479,22 +4479,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX2-NEXT: .LBB80_6: ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX2-NEXT: jmp .LBB80_9 ; AVX2-NEXT: .LBB80_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_9: ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax @@ -4524,29 +4524,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: .LBB80_15: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_16 ; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX2-NEXT: jmp .LBB80_18 ; AVX2-NEXT: .LBB80_16: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB80_18: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rax +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vmovq %xmm3, %rax ; AVX2-NEXT: testq 
%rax, %rax ; AVX2-NEXT: js .LBB80_19 ; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX2-NEXT: jmp .LBB80_21 ; AVX2-NEXT: .LBB80_19: ; AVX2-NEXT: movq %rax, %rcx @@ -4554,25 +4554,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: .LBB80_21: +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: vpextrq $1, %xmm3, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_22 ; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX2-NEXT: jmp .LBB80_24 ; AVX2-NEXT: .LBB80_22: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB80_24: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/vec_set-2.ll b/test/CodeGen/X86/vec_set-2.ll index 443264cdffd4..51c8b2111107 100644 --- a/test/CodeGen/X86/vec_set-2.ll +++ b/test/CodeGen/X86/vec_set-2.ll @@ -1,11 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test1(float %a) nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test1: +; X86: # BB#0: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2 @@ -14,10 +22,15 @@ define <4 x float> @test1(float %a) nounwind { } define <2 x i64> @test(i32 %a) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: retq %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 %tmp6 = insertelement <4 x i32> %tmp, i32 0, i32 1 %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index ee4a08599968..b34f30924a8d 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test(float %a) { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 @@ -13,11 +19,17 @@ define <4 x float> @test(float %a) { } define <2 x i64> @test2(i32 %a) { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X64-NEXT: retq %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> @@ -25,10 +37,15 @@ define <2 x i64> @test2(i32 %a) { } define <4 x float> @test3(<4 x float> %A) { -; CHECK-LABEL: test3: -; CHECK: # BB#0: -; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test3: +; X86: # BB#0: +; X86-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X64-NEXT: retq %tmp0 = extractelement <4 x float> %A, i32 0 %tmp1 = insertelement <4 x float> , float %tmp0, i32 1 %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2 diff --git a/test/CodeGen/X86/vec_set-4.ll b/test/CodeGen/X86/vec_set-4.ll index 8f35529d61b4..09142e16aa6e 100644 --- a/test/CodeGen/X86/vec_set-4.ll +++ b/test/CodeGen/X86/vec_set-4.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(i16 %a) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pinsrw $3, %edi, %xmm0 +; X64-NEXT: retq %tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3 %tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4 %tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5 @@ -17,12 +24,19 @@ define <2 x i64> @test(i16 %a) nounwind { } define <2 x i64> @test2(i8 %a) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: pinsrw $5, %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: pinsrw $5, %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pinsrw $5, %eax, %xmm0 +; X64-NEXT: retq %tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10 %tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11 %tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12 diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll index 4429834b8ef0..3c9aca3a02da 100644 --- a/test/CodeGen/X86/vec_set-6.ll +++ b/test/CodeGen/X86/vec_set-6.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test(float %a, float %b, float %c) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2 %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3 diff --git a/test/CodeGen/X86/vec_set-7.ll b/test/CodeGen/X86/vec_set-7.ll index e8fe6debb140..757a0d44cd43 100644 --- a/test/CodeGen/X86/vec_set-7.ll +++ b/test/CodeGen/X86/vec_set-7.ll @@ -1,12 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(<2 x i64>* %p) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: retq %tmp = bitcast <2 x i64>* %p to double* %tmp.upgrd.1 = load double, double* %tmp %tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0 diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll index 7a4326c01bb7..a9dceb90855a 100644 --- a/test/CodeGen/X86/vec_set-8.ll +++ b/test/CodeGen/X86/vec_set-8.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s 
-mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(i64 %i) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: retq +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0 %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1 ret <2 x i64> %tmp11 diff --git a/test/CodeGen/X86/vec_set-A.ll b/test/CodeGen/X86/vec_set-A.ll index cae39a3d775b..259ace98d362 100644 --- a/test/CodeGen/X86/vec_set-A.ll +++ b/test/CodeGen/X86/vec_set-A.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test1() nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test1: +; X86: # BB#0: +; X86-NEXT: movl $1, %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: movl $1, %eax +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: retq ret <2 x i64> < i64 1, i64 0 > } diff --git a/test/CodeGen/X86/vec_set-B.ll b/test/CodeGen/X86/vec_set-B.ll index 0580a3376656..ecd9b57cfd0c 100644 --- a/test/CodeGen/X86/vec_set-B.ll +++ b/test/CodeGen/X86/vec_set-B.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 ; These should both generate something like this: ;_test3: @@ -9,26 +10,37 @@ ; ret define <2 x i64> @test3(i64 %arg) nounwind { -; CHECK-LABEL: test3: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687 -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test3: +; X86: # BB#0: +; X86-NEXT: movl $1234567, %eax # imm = 0x12D687 +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: andl $1234567, %edi # imm = 0x12D687 +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %A = and i64 %arg, 1234567 %B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0 ret <2 x i64> %B } define <2 x i64> @test2(i64 %arg) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687 -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movl $1234567, %eax # imm = 0x12D687 +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: andl $1234567, %edi # imm = 0x12D687 +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %A = and i64 %arg, 1234567 %B = insertelement <2 x i64> undef, i64 %A, i32 0 ret <2 x i64> %B } - diff --git 
a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll index 994bc2b3056e..865e2fb83f17 100644 --- a/test/CodeGen/X86/vec_set-C.ll +++ b/test/CodeGen/X86/vec_set-C.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | FileCheck %s --check-prefix=X64 define <2 x i64> @t1(i64 %x) nounwind { -; X32-LABEL: t1: -; X32: # BB#0: -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: retl +; X86-LABEL: t1: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl ; ; X64-LABEL: t1: ; X64: # BB#0: diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll index 49bd3beef75a..6439a6dcb00b 100644 --- a/test/CodeGen/X86/vec_set.ll +++ b/test/CodeGen/X86/vec_set.ll @@ -1,27 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64 define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NEXT: movdqa %xmm3, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: 
punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-NEXT: movdqa %xmm3, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movd %r8d, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: movd %edx, %xmm1 +; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movd %r9d, %xmm2 +; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X64-NEXT: movdqa %xmm3, (%rdi) +; X64-NEXT: retq %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1 %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2 diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index 226c0adbaf3c..2fb821555dba 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ b/test/CodeGen/X86/vector-bitreverse.ll @@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll index a05a981daa1f..f0a5fe1dbfff 100644 --- a/test/CodeGen/X86/vector-blend.ll +++ b/test/CodeGen/X86/vector-blend.ll @@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_logic_v8i32: @@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: 
pandn %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_logic_v8i32: diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll index f1f795bf3cb0..e3261d15538f 100644 --- a/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1,15 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD ; ; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt. 
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: @@ -194,16 +196,46 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv2i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv2i64: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv2i64: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: @@ -429,16 +461,46 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv2i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, 
%xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv2i64u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv2i64u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64u: @@ -651,16 +713,41 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv4i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i32: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i32: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32: @@ -867,16 +954,41 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv4i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; 
AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i32u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i32u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32u: @@ -1054,8 +1166,28 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv8i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i16: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 @@ -1063,7 +1195,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i16: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 @@ -1238,8 +1370,28 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv8i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; 
AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i16u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 @@ -1247,7 +1399,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i16u: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 @@ -1399,8 +1551,23 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv16i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 @@ -1546,8 +1713,23 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv16i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i8u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 @@ -1582,17 +1764,17 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64: -; AVX: # BB#0: -; AVX-NEXT: movl $55, %eax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: retq +; NOBW-LABEL: foldv2i64: +; NOBW: # BB#0: +; NOBW-NEXT: movl $55, %eax +; NOBW-NEXT: vmovq %rax, %xmm0 +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv2i64: -; AVX512: ## BB#0: -; AVX512-NEXT: movl $55, %eax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv2i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: movl $55, %eax +; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: ; X32-SSE: # BB#0: @@ -1610,17 +1792,17 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64u: -; AVX: # BB#0: -; AVX-NEXT: movl $55, %eax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: retq +; NOBW-LABEL: foldv2i64u: +; NOBW: # BB#0: +; NOBW-NEXT: movl $55, %eax +; NOBW-NEXT: vmovq %rax, %xmm0 +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv2i64u: -; AVX512: ## BB#0: -; AVX512-NEXT: movl $55, %eax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv2i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: movl $55, %eax +; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # BB#0: @@ -1637,15 +1819,15 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX-NEXT: retq +; NOBW-LABEL: foldv4i32: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv4i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # BB#0: @@ -1661,15 +1843,15 @@ define <4 x i32> @foldv4i32u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX-NEXT: retq +; NOBW-LABEL: foldv4i32u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv4i32u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv4i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # BB#0: @@ -1685,15 +1867,15 @@ define <8 x i16> @foldv8i16() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX-NEXT: retq +; NOBW-LABEL: foldv8i16: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = 
[7,0,16,8,16,13,11,9] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv8i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # BB#0: @@ -1709,15 +1891,15 @@ define <8 x i16> @foldv8i16u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX-NEXT: retq +; NOBW-LABEL: foldv8i16u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv8i16u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv8i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # BB#0: @@ -1733,15 +1915,15 @@ define <16 x i8> @foldv16i8() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX-NEXT: retq +; NOBW-LABEL: foldv16i8: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv16i8: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv16i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # BB#0: @@ -1757,15 +1939,15 @@ define <16 x i8> @foldv16i8u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX-NEXT: retq +; NOBW-LABEL: foldv16i8u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv16i8u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv16i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # BB#0: diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll index 53cb4d8e445b..185e1f4865ea 100644 --- a/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/test/CodeGen/X86/vector-lzcnt-256.ll @@ -1,11 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 
--check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD ; ; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt. -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: @@ -93,16 +95,76 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv4i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand 
%ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i64: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i64: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv4i64: @@ -225,16 +287,76 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv4i64u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv4i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; 
AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i64u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i64u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv4i64u: @@ -342,16 +464,66 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv8i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; 
AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i32: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i32: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv8i32: @@ -454,16 +626,66 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv8i32u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv8i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i32u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; 
AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i32u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv8i32u: @@ -551,8 +773,48 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv16i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 @@ -638,8 +900,48 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv16i16u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: 
vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv16i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i16u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 @@ -710,8 +1012,38 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv32i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; 
AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 @@ -784,8 +1116,38 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv32i8u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv32i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv32i8u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 @@ -818,15 +1180,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; AVX-LABEL: foldv4i64: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv4i64: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX512-NEXT: retq +; X64-LABEL: foldv4i64: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # BB#0: @@ -837,15 +1194,10 @@ define <4 x i64> @foldv4i64() nounwind { } define <4 x i64> @foldv4i64u() nounwind { -; AVX-LABEL: foldv4i64u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv4i64u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; 
AVX512-NEXT: retq +; X64-LABEL: foldv4i64u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # BB#0: @@ -856,15 +1208,10 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; AVX-LABEL: foldv8i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32: ; X32-AVX: # BB#0: @@ -875,15 +1222,10 @@ define <8 x i32> @foldv8i32() nounwind { } define <8 x i32> @foldv8i32u() nounwind { -; AVX-LABEL: foldv8i32u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv8i32u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32u: ; X32-AVX: # BB#0: @@ -894,15 +1236,15 @@ define <8 x i32> @foldv8i32u() nounwind { } define <16 x i16> @foldv16i16() nounwind { -; AVX-LABEL: foldv16i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv16i16: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16: ; X32-AVX: # BB#0: @@ -913,15 +1255,15 @@ define <16 x i16> @foldv16i16() nounwind { } define <16 x i16> @foldv16i16u() nounwind { -; AVX-LABEL: foldv16i16u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv16i16u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv16i16u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16u: ; X32-AVX: # BB#0: @@ -932,15 +1274,15 @@ define <16 x i16> @foldv16i16u() nounwind { } define <32 x i8> @foldv32i8() nounwind { -; AVX-LABEL: foldv32i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv32i8: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv32i8: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = 
[8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8: ; X32-AVX: # BB#0: @@ -951,15 +1293,15 @@ define <32 x i8> @foldv32i8() nounwind { } define <32 x i8> @foldv32i8u() nounwind { -; AVX-LABEL: foldv32i8u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv32i8u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv32i8u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8u: ; X32-AVX: # BB#0: diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll new file mode 100644 index 000000000000..f737ea2b7fba --- /dev/null +++ b/test/CodeGen/X86/vector-narrow-binop.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ + +; AVX1 has support for 256-bit bitwise logic because the FP variants were included. +; If using those ops requires extra insert/extract though, it's probably not worth it. 
+ +define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { +; SSE-LABEL: PR32790: +; SSE: # BB#0: +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: psubd %xmm6, %xmm0 +; SSE-NEXT: psubd %xmm7, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: PR32790: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR32790: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR32790: +; AVX512: # BB#0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: retq + %add = add <8 x i32> %a, %b + %and = and <8 x i32> %add, %c + %sub = sub <8 x i32> %and, %d + ret <8 x i32> %sub +} + +; In a more extreme case, even the later AVX targets should avoid extract/insert just +; because 256-bit ops are supported. + +define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { +; SSE-LABEL: do_not_use_256bit_op: +; SSE: # BB#0: +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: do_not_use_256bit_op: +; AVX1: # BB#0: +; AVX1-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: do_not_use_256bit_op: +; AVX2: # BB#0: +; AVX2-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: do_not_use_256bit_op: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> + %and = and <8 x i32> %concat1, %concat2 + %extract1 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> + %extract2 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> + %sub = sub <4 x i32> %extract1, %extract2 + ret <4 x i32> %sub +} + diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll index f05588a2920c..99a05c3d49c0 100644 --- a/test/CodeGen/X86/vector-pcmp.ll +++ 
b/test/CodeGen/X86/vector-pcmp.ll @@ -148,8 +148,8 @@ define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) { ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -177,8 +177,8 @@ define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -206,8 +206,8 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -242,14 +242,13 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) { ; ; AVX1-LABEL: test_pcmpgtq_256: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index af3ddcf8048e..09e143ddcd4d 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. 
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -81,6 +85,41 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -147,6 +186,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; X32-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; 
X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, %b ret <8 x i32> %shift } @@ -253,6 +327,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -436,6 +559,89 @@ define <32 x i8> 
@var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: 
var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -499,6 +705,33 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat ret <4 x i64> %shift @@ -546,6 +779,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i32> %a, %splat ret <8 x i32> %shift @@ -593,6 +841,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i16> %a, %splat ret <16 x i16> %shift @@ -776,6 +1039,84 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; 
X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -843,6 +1184,43 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0 ; 
AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -893,6 +1271,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; X32-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsravd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } @@ -980,6 +1381,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsraw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsraw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; X32-AVX2-NEXT: vpsravd %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } @@ -1149,6 +1584,81 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } @@ -1206,6 +1716,25 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x 
i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -1246,6 +1775,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } @@ -1286,6 +1828,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } @@ -1352,6 +1907,31 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: 
vpsubb %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index 60575250d713..46be36b76e98 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -59,6 +63,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -125,6 +149,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; X32-AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; 
X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, %b ret <8 x i32> %shift } @@ -231,6 +290,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; 
X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -357,6 +465,56 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -401,6 +559,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = lshr <4 x i64> %a, %splat ret <4 x i64> %shift @@ -448,6 
+623,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i32> %a, %splat ret <8 x i32> %shift @@ -495,6 +685,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i16> %a, %splat ret <16 x i16> %shift @@ -625,6 +830,55 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; X32-AVX1-NEXT: 
vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -677,6 +931,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvq {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } @@ -727,6 +1002,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; X32-AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrld $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrld $9, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } @@ -814,6 +1112,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: 
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; X32-AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } @@ -930,6 +1262,52 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; 
X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } @@ -974,6 +1352,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } @@ -1014,6 +1405,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } @@ -1054,6 +1458,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } @@ -1103,6 +1520,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; 
X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index 7f534050b6a7..4a134f440a78 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -56,6 +60,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift } @@ -105,6 +129,27 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; X32-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; X32-AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, %b ret 
<8 x i32> %shift } @@ -205,6 +250,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsllw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } @@ -319,6 +413,52 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, 
%xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } @@ -363,6 +503,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = shl <4 x i64> %a, %splat ret <4 x i64> %shift @@ -410,6 +567,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = shl <8 x i32> %a, %splat ret <8 x i32> %shift @@ -457,6 +629,21 @@ define <16 x i16> 
@splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = shl <16 x i16> %a, %splat ret <16 x i16> %shift @@ -577,6 +764,51 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat ret <32 x i8> %shift @@ -626,6 +858,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: 
constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvq {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, ret <4 x i64> %shift } @@ -666,6 +919,19 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, ret <8 x i32> %shift } @@ -719,6 +985,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, ret <16 x i16> %shift } @@ -827,6 +1106,48 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm5, 
%xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, ret <32 x i8> %shift } @@ -871,6 +1192,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, ret <4 x i64> %shift } @@ -911,6 +1245,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpslld $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, ret <8 x i32> %shift } @@ -951,6 +1298,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, ret <16 x i16> %shift } @@ -999,6 +1359,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 
$1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll index 26cd7301fe60..7a5c992bb829 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -1,129 +1,235 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefixes=ALL,KNL %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefixes=ALL,SKX %s target triple = "x86_64-unknown-unknown" -define <32 x i16> @shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; ALL: # BB#0: -; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 -; ALL-NEXT: retq +define <32 x i16> @shuffle_v32i16(<32 x i16> %a) { +; KNL-LABEL: shuffle_v32i16: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16: +; SKX: ## BB#0: +; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: -; ALL: # BB#0: -; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 -; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; SKX: ## BB#0: +; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: -; ALL: # BB#0: -; ALL-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> -; ALL-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: +; KNL: ## BB#0: +; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] +; KNL-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] +; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255> +; KNL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> +; KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0> +; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: +; SKX: ## BB#0: +; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> +; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: -; ALL: # BB#0: -; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] -; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7] +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5 +; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7] +; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15] +; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3 +; KNL-NEXT: vpbroadcastw %xmm3, %ymm3 +; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; KNL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 +; KNL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; KNL-NEXT: retq +; +; SKX-LABEL: 
shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: +; SKX: ## BB#0: +; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] +; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: -; ALL: # BB#0: -; ALL-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: +; KNL: ## BB#0: +; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: +; SKX: ## BB#0: +; SKX-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: -; ALL: # BB#0: -; ALL-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: +; KNL: ## BB#0: +; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: +; SKX: ## BB#0: +; SKX-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: -; ALL: # BB#0: -; ALL-NEXT: vpsrld $16, %zmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: +; KNL: ## BB#0: +; KNL-NEXT: vpsrld $16, %ymm0, %ymm0 +; KNL-NEXT: vpsrld $16, %ymm1, %ymm1 +; KNL-NEXT: retq +; 
+; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: +; SKX: ## BB#0: +; SKX-NEXT: vpsrld $16, %zmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: -; ALL: # BB#0: -; ALL-NEXT: vpslld $16, %zmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $16, %ymm0, %ymm0 +; KNL-NEXT: vpslld $16, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $16, %zmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: -; ALL: # BB#0: -; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: +; KNL: ## BB#0: +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: +; SKX: ## BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: -; ALL: # BB#0: -; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: +; KNL: ## BB#0: +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: +; SKX: ## BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: 
shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: -; ALL: # BB#0: -; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: +; KNL: ## BB#0: +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: +; SKX: ## BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; ALL: # BB#0: -; ALL-NEXT: movl $1, %eax -; ALL-NEXT: kmovd %eax, %k1 -; ALL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; KNL: ## BB#0: +; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF +; KNL-NEXT: vmovd %eax, %xmm1 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; SKX: ## BB#0: +; SKX-NEXT: movl $1, %eax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %shuffle } define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { -; ALL-LABEL: insert_dup_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -132,11 +238,19 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { } define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { -; ALL-LABEL: insert_dup_mem_v32i16_sext_i16: -; ALL: # BB#0: -; ALL-NEXT: movswl (%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: +; KNL: ## BB#0: +; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: +; SKX: ## BB#0: +; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i16, i16* 
%ptr, align 2 %tmp1 = sext i16 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -146,11 +260,17 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { } define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { -; ALL-LABEL: insert_dup_elt1_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movzwl 2(%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -159,11 +279,17 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { } define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { -; ALL-LABEL: insert_dup_elt3_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movzwl 2(%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -172,19 +298,79 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { } define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: -; ALL: # BB#0: -; ALL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> ret <32 x i16> %shuffle } define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: -; ALL: # BB#0: -; 
ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> ret <32 x i16> %shuffle } + +define <8 x i16> @pr32967(<32 x i16> %v) { +; KNL-LABEL: pr32967: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; KNL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; KNL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; KNL-NEXT: retq +; +; SKX-LABEL: pr32967: +; SKX: ## BB#0: +; SKX-NEXT: vpextrw $5, %xmm0, %eax +; SKX-NEXT: vpextrw $1, %xmm0, %ecx +; SKX-NEXT: vmovd %ecx, %xmm1 +; SKX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; SKX-NEXT: vpextrw $1, %xmm2, %eax +; SKX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpextrw $5, %xmm2, %eax +; SKX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; SKX-NEXT: vpextrw $1, %xmm2, %eax +; SKX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] +; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; SKX-NEXT: vpextrw $1, %xmm0, %eax +; SKX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpextrw $5, %xmm0, %eax +; SKX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle +} diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll index c5ac4466b5fa..8081e9482d67 100644 --- a/test/CodeGen/X86/vector-sqrt.ll +++ b/test/CodeGen/X86/vector-sqrt.ll @@ -29,11 +29,11 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 -; 
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index 34a9df1782a4..f5ec8e540b0b 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -405,16 +405,16 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_ge_v2i64:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_ge_v2i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
@@ -447,21 +447,20 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
 ; AVX1-LABEL: test_abs_gt_v4i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_gt_v4i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -504,35 +503,31 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
 ; AVX1-LABEL: test_abs_le_v8i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_le_v8i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
@@ -581,37 +576,33 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
 ; AVX1-NEXT: vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_le_v8i64_fold:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
 ; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
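Every viabs.ll change above encodes the same branchless identity: with the per-lane mask m = (x < 0) ? -1 : 0, we have |x| = (x + m) ^ m. The new sequences materialize m with vpxor plus vpcmpgtq against zero instead of splatting the sign bit with vpsrad and vpshufd; both compute the same mask. A minimal IR sketch of the identity itself (the function name is illustrative):

define <4 x i64> @abs_via_signmask(<4 x i64> %x) {
  %isneg = icmp slt <4 x i64> %x, zeroinitializer
  %m = sext <4 x i1> %isneg to <4 x i64>  ; all-ones lanes where %x is negative
  %biased = add <4 x i64> %x, %m          ; x - 1 in the negative lanes
  %abs = xor <4 x i64> %biased, %m        ; complementing flips x - 1 to -x
  ret <4 x i64> %abs
}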
diff --git a/test/CodeGen/X86/vselect-pcmp.ll b/test/CodeGen/X86/vselect-pcmp.ll
index d33fda4f49c2..7807991b455d 100644
--- a/test/CodeGen/X86/vselect-pcmp.ll
+++ b/test/CodeGen/X86/vselect-pcmp.ll
@@ -35,9 +35,7 @@ define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) {
 ; AVX: # BB#0:
 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpandn %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %tr = icmp slt <8 x i16> %mask, zeroinitializer
   %z = select <8 x i1> %tr, <8 x i16> %x, <8 x i16> %y
@@ -162,18 +160,14 @@ define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %mask) {
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
 ; AVX2-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: signbit_sel_v16i16:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpxor %ymm3, %ymm3, %ymm3
 ; AVX512-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT: retq
   %tr = icmp slt <16 x i16> %mask, zeroinitializer
   %z = select <16 x i1> %tr, <16 x i16> %x, <16 x i16> %y
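The vselect-pcmp.ll changes above collapse the three-instruction masked merge into a single vpblendvb, which selects by the sign bit of each mask byte. What the old vpandn/vpand/vpor sequence computed was the classic (x & m) | (y & ~m) merge; a sketch of that form, assuming %m holds all-ones or all-zeros per lane as vpcmpgtw produces (the function name is illustrative):

define <8 x i16> @merge_by_mask(<8 x i16> %x, <8 x i16> %y, <8 x i16> %m) {
  %takex = and <8 x i16> %m, %x
  ; ~m selects the lanes of %y that the mask rejects
  %notm = xor <8 x i16> %m, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %takey = and <8 x i16> %notm, %y
  %merged = or <8 x i16> %takex, %takey
  ret <8 x i16> %merged
}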
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91e77a3..450e255313b3 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
 ; AVX-NEXT: retq
   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32>
@@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
 ; AVX-NEXT: retq
   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32>
@@ -124,9 +124,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: retq
   %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 7e370c25e31b..3052a0f615eb 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py for function "bar"
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
 ;; be preserved except for registers used for passing/returning arguments.
 ;; In the following function registers %RDI, %RSI and %XMM0 are used to store
 ;; arguments %a0, %a1 and %b0 accordingly. The value is returned in %RAX.
@@ -28,20 +28,20 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
   ret i32 4
 }
 
-;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
-;; doesn't need to preserve registers except for the arguments passed
+;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
+;; doesn't need to preserve registers except for the arguments passed
 ;; to "bar" (%ESI, %EDI and %XMM0).
 define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
-; CHECK-LABEL: foo
-; CHECK: movaps %xmm0, %xmm1
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: callq bar
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtsi2ssl %eax, %xmm0
-; CHECK-NEXT: addss %xmm0, %xmm1
+; CHECK-LABEL: foo
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtsi2ssl %eax, %xmm0
+; CHECK-NEXT: addss %xmm0, %xmm1
 ; CHECK: retq
   %call = call i32 @bar(i32 %a0, i32 %a1, float %b0) #0
   %c0 = add i32 %a0, %call
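This test and the x86-no_caller_saved_registers.ll diff that follows document the same contract: a function carrying the attribute must itself preserve every register it touches except those used to pass and return its arguments, so callers such as foo may keep values live in normally call-clobbered registers across the call. At the IR level it is the string attribute shown in the tests' attribute groups; a minimal sketch (the function is hypothetical):

define i32 @increment(i32 %n) #0 {
  ; any register this function clobbers beyond %edi/%eax must be saved/restored
  %r = add i32 %n, 1
  ret i32 %r
}

attributes #0 = { "no_caller_saved_registers" }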
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers.ll b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
index 9c62e3ee6ba7..4e5403d1847f 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
@@ -1,31 +1,31 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
-;; be preserved except for registers used for passing/returning arguments.
-;; The test checks that function "bar" preserves xmm0 register.
-;; It also checks that caller function "foo" does not store registers for callee
-;; "bar". For example, there is no store/load/access to xmm registers.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
-; CHECK-LABEL: bar
-; CHECK: mov{{.*}} %xmm0
-; CHECK: mov{{.*}} {{.*}}, %xmm0
-; CHECK: ret
- call void asm sideeffect "", "~{xmm0}"()
- ret i32 1
-}
-
-define x86_intrcc void @foo(i8* nocapture readnone %c) {
-; CHECK-LABEL: foo
-; CHECK-NOT: xmm
-entry:
- tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
- ret void
-}
-
-attributes #0 = { "no_caller_saved_registers" }
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; be preserved except for registers used for passing/returning arguments.
+;; The test checks that function "bar" preserves xmm0 register.
+;; It also checks that caller function "foo" does not store registers for callee
+;; "bar". For example, there is no store/load/access to xmm registers.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
+; CHECK-LABEL: bar
+; CHECK: mov{{.*}} %xmm0
+; CHECK: mov{{.*}} {{.*}}, %xmm0
+; CHECK: ret
+ call void asm sideeffect "", "~{xmm0}"()
+ ret i32 1
+}
+
+define x86_intrcc void @foo(i8* nocapture readnone %c) {
+; CHECK-LABEL: foo
+; CHECK-NOT: xmm
+entry:
+ tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
+ ret void
+}
+
+attributes #0 = { "no_caller_saved_registers" }
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 5b6e773fe5d4..519f0d0924e3 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -270,8 +270,6 @@ if.end: ; preds = %if.else, %for.end
   ret i32 %sum.1
 }
 
-declare void @somethingElse(...)
-
 ; Check with a more complex case that we do not have restore within the loop and
 ; save outside.
 ; CHECK-LABEL: loopInfoRestoreOutsideLoop:
@@ -982,3 +980,54 @@ for.inc:
 }
 
 attributes #4 = { "no-frame-pointer-elim"="true" }
+
+@x = external global i32, align 4
+@y = external global i32, align 4
+
+; The post-dominator tree does not include the branch containing the infinite
+; loop, which can result in a misplacement of the restore block, if we're
+; looking for the nearest common post-dominator of an "unreachable" block.
+
+; CHECK-LABEL: infiniteLoopNoSuccessor:
+; CHECK: ## BB#0:
+; Make sure the prologue happens in the entry block.
+; CHECK-NEXT: pushq %rbp
+; ...
+; Make sure we don't shrink-wrap.
+; CHECK: ## BB#1
+; CHECK-NOT: pushq %rbp
+; ...
+; Make sure the epilogue happens in the exit block.
+; CHECK: ## BB#5
+; CHECK: popq %rbp
+; CHECK-NEXT: retq
+define void @infiniteLoopNoSuccessor() #5 {
+  %1 = load i32, i32* @x, align 4
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %3, label %4
+
;